/**
 * Builds the structure hierarchy for the given jCas, limited to {@code structuralClasses}, and
 * passes it to the {@code doProcess} overload that accepts the built hierarchy.
 *
 * @param jCas the jCas to build the hierarchy from
 * @throws AnalysisEngineProcessException declared by this method; which call raises it is not
 *     visible in this block
 */
@Override
protected void doProcess(JCas jCas) throws AnalysisEngineProcessException {
  doProcess(
      StructureHierarchy.build(jCas, structuralClasses));
}
/**
 * Creates the covering structure hierarchy for the supplied jCas, considering only the given
 * structural classes.
 *
 * <p>The structure is built by first using the offset of the Structure annotation and then using
 * the depth. The root node comes from {@code StructureHierarchy.build} and the covering map from
 * {@code buildCovering}; both are handed to the {@link CoveringStructureHierarchy} constructor.
 *
 * @param jCas the jCas
 * @param structuralClasses the structural classes
 * @return the CoveringStructureHierarchy
 */
public static CoveringStructureHierarchy build(
    JCas jCas, Set<Class<? extends Structure>> structuralClasses) {
  // Root first, covering map second - same call order as before.
  final Node<Structure> hierarchyRoot =
      StructureHierarchy.build(jCas, structuralClasses).getRoot();
  final Map<Annotation, Collection<Structure>> coveringByAnnotation =
      buildCovering(jCas, structuralClasses);

  return new CoveringStructureHierarchy(hierarchyRoot, coveringByAnnotation);
}
/**
 * Assembles a {@link CoveringStructureHierarchy} from the given jCas, restricted to the provided
 * structural classes.
 *
 * <p>The structure is built by first using the offset of the Structure annotation and then using
 * the depth.
 *
 * @param jCas the jCas
 * @param structuralClasses the structural classes
 * @return the CoveringStructureHierarchy
 */
public static CoveringStructureHierarchy build(
    JCas jCas, Set<Class<? extends Structure>> structuralClasses) {
  // Arguments are evaluated left to right: the hierarchy root is built before the covering map.
  return new CoveringStructureHierarchy(
      StructureHierarchy.build(jCas, structuralClasses).getRoot(),
      buildCovering(jCas, structuralClasses));
}
@Override protected void writeBody(final JCas jCas, final Element body) { final Node<Structure> root = StructureHierarchy.build(jCas, structuralClasses).getRoot(); walk(body, root); // We need to create the proper li tags under ol and ul body.select("ul > p").wrap("<li></li>"); body.select("ol > p").wrap("<li></li>"); // Correct table cells from td to th in header body.select("thead td").tagName("th"); // Add to any empty td or th's body.select("td:empty,th:empty").html(" "); if (!outputEmptyTags) { Elements e = emptyElements(body); while (!e.isEmpty()) { e.remove(); e = emptyElements(body); } } // TODO: In accordance with HTML spec // - Captions for Table should be moved inside the table // - Captions for Figure should be moved inside the figure }
/**
 * Turns the given HTML into a structure hierarchy and returns its root node.
 *
 * <p>The HTML is parsed with Jsoup and applied by {@code converter} to the JCas obtained from
 * {@code JCasSingleton}; the hierarchy is then built from that JCas using {@code
 * structuralClasses}.
 *
 * @param html the HTML to parse
 * @return the root node of the built structure hierarchy
 * @throws UIMAException declared by this method; which call raises it is not visible in this
 *     block
 */
public Node<Structure> createStructure(String html) throws UIMAException {
  final JCas cas = JCasSingleton.getJCasInstance();

  converter.apply(Jsoup.parse(html), cas);

  return StructureHierarchy.build(cas, structuralClasses).getRoot();
}
}
/**
 * A single depth-1 paragraph spanning 0 to {@code TEXT.length()} must be returned by the plain
 * "Paragraph" selector.
 */
@Test
public void testSelectSimple() throws UIMAException, InvalidParameterException {
  final Paragraph onlyParagraph = new Paragraph(jCas);
  onlyParagraph.setBegin(0);
  onlyParagraph.setDepth(1);
  onlyParagraph.setEnd(TEXT.length());
  onlyParagraph.addToIndexes();

  final Optional<Structure> found =
      new RecordStructureManager(StructureHierarchy.build(jCas, structuralClasses))
          .select("Paragraph");

  assertTrue(found.isPresent());
  assertEquals(onlyParagraph, found.get());
}
/**
 * Sets the document text on the JCas from {@code JCasSingleton}, adds the test annotations and
 * creates the {@code recordStructureManager} over the resulting structure hierarchy.
 */
@Before
public void setUp() throws Exception {
  final JCas cas = JCasSingleton.getJCasInstance();
  cas.setDocumentText(TEXT);
  addAnnotations(cas);

  // The hierarchy is built only after the annotations have been added
  recordStructureManager =
      new RecordStructureManager(
          StructureHierarchy.build(cas, StructureUtil.getStructureClasses()));
}
new RecordStructureManager(StructureHierarchy.build(jCas, structuralClasses));
/**
 * With two sibling depth-1 paragraphs, nth-of-type(1) and nth-of-type(2) must each select the
 * matching paragraph, and nth-of-type(3) must select nothing.
 */
@Test
public void testSelectNthTwo() throws InvalidParameterException {
  final Paragraph first = new Paragraph(jCas);
  first.setBegin(0);
  first.setDepth(1);
  first.setEnd(20);
  first.addToIndexes();

  final Paragraph second = new Paragraph(jCas);
  second.setBegin(20);
  second.setDepth(1);
  second.setEnd(TEXT.length());
  second.addToIndexes();

  final RecordStructureManager selector =
      new RecordStructureManager(StructureHierarchy.build(jCas, structuralClasses));

  // First of type: the first paragraph, and explicitly not the second
  final Optional<Structure> firstMatch = selector.select("Paragraph:nth-of-type(1)");
  assertTrue(firstMatch.isPresent());
  assertEquals(first, firstMatch.get());
  assertNotEquals(second, firstMatch.get());

  // Second of type: the second paragraph, and explicitly not the first
  final Optional<Structure> secondMatch = selector.select("Paragraph:nth-of-type(2)");
  assertTrue(secondMatch.isPresent());
  assertEquals(second, secondMatch.get());
  assertNotEquals(first, secondMatch.get());

  // There is no third paragraph
  final Optional<Structure> thirdMatch = selector.select("Paragraph:nth-of-type(3)");
  assertFalse(thirdMatch.isPresent());
}
/**
 * A depth-2 paragraph with the same span as a depth-1 section must get a selector path that
 * goes through the section.
 */
@Test
public void testGenerateNested() {
  final Section outer = new Section(jCas);
  outer.setBegin(0);
  outer.setDepth(1);
  outer.setEnd(TEXT.length());
  outer.addToIndexes();

  final Paragraph inner = new Paragraph(jCas);
  inner.setBegin(0);
  inner.setDepth(2);
  inner.setEnd(TEXT.length());
  inner.addToIndexes();

  final ItemHierarchy<Structure> hierarchy = StructureHierarchy.build(jCas, structuralClasses);
  final String selector = hierarchy.getSelectorPath(inner).toString();

  assertEquals("Section:nth-of-type(1) > Paragraph:nth-of-type(1)", selector);
}
/** A single depth-1 paragraph must get the selector path "Paragraph:nth-of-type(1)". */
@Test
public void testGenerateSimple() {
  final Paragraph onlyParagraph = new Paragraph(jCas);
  onlyParagraph.setBegin(0);
  onlyParagraph.setDepth(1);
  onlyParagraph.setEnd(20);
  onlyParagraph.addToIndexes();

  final ItemHierarchy<Structure> hierarchy = StructureHierarchy.build(jCas, structuralClasses);
  final String selector = hierarchy.getSelectorPath(onlyParagraph).toString();

  assertEquals("Paragraph:nth-of-type(1)", selector);
}
/** Two sibling depth-1 paragraphs must get nth-of-type selector paths 1 and 2 respectively. */
@Test
public void testGenerateTwo() {
  final Paragraph first = new Paragraph(jCas);
  first.setBegin(0);
  first.setDepth(1);
  first.setEnd(20);
  first.addToIndexes();

  final Paragraph second = new Paragraph(jCas);
  second.setBegin(20);
  second.setDepth(1);
  second.setEnd(TEXT.length());
  second.addToIndexes();

  final ItemHierarchy<Structure> hierarchy = StructureHierarchy.build(jCas, structuralClasses);

  // Both paths are resolved before either is asserted, as in the original test
  final SelectorPath firstPath = hierarchy.getSelectorPath(first);
  final SelectorPath secondPath = hierarchy.getSelectorPath(second);

  assertEquals("Paragraph:nth-of-type(1)", firstPath.toString());
  assertEquals("Paragraph:nth-of-type(2)", secondPath.toString());
}
/**
 * Two depth-2 paragraphs inside one depth-1 section must get selector paths that go through the
 * section and are numbered 1 and 2.
 */
@Test
public void testGenerateNested2() {
  final Section outer = new Section(jCas);
  outer.setBegin(0);
  outer.setDepth(1);
  outer.setEnd(TEXT.length());
  outer.addToIndexes();

  final Paragraph first = new Paragraph(jCas);
  first.setBegin(0);
  first.setDepth(2);
  first.setEnd(20);
  first.addToIndexes();

  final Paragraph second = new Paragraph(jCas);
  second.setBegin(20);
  second.setDepth(2);
  second.setEnd(TEXT.length());
  second.addToIndexes();

  final ItemHierarchy<Structure> hierarchy = StructureHierarchy.build(jCas, structuralClasses);

  // Both paths are resolved before either is asserted, as in the original test
  final SelectorPath firstPath = hierarchy.getSelectorPath(first);
  final SelectorPath secondPath = hierarchy.getSelectorPath(second);

  assertEquals("Section:nth-of-type(1) > Paragraph:nth-of-type(1)", firstPath.toString());
  assertEquals("Section:nth-of-type(1) > Paragraph:nth-of-type(2)", secondPath.toString());
}
/**
 * Truncating the selector path of either nested paragraph to depth 1 must leave only the
 * enclosing section.
 *
 * <p>The second path is taken from {@code paragraph2}; previously both paths were taken from
 * {@code paragraph1}, so the second paragraph was never checked.
 */
@Test
public void testGenerateNestedToDepth1() {
  Section section = new Section(jCas);
  section.setBegin(0);
  section.setDepth(1);
  section.setEnd(TEXT.length());
  section.addToIndexes();

  Paragraph paragraph1 = new Paragraph(jCas);
  paragraph1.setBegin(0);
  paragraph1.setDepth(2);
  paragraph1.setEnd(20);
  paragraph1.addToIndexes();

  Paragraph paragraph2 = new Paragraph(jCas);
  paragraph2.setBegin(20);
  paragraph2.setDepth(2);
  paragraph2.setEnd(TEXT.length());
  paragraph2.addToIndexes();

  ItemHierarchy<Structure> structureHierarchy = StructureHierarchy.build(jCas, structuralClasses);

  SelectorPath path1 = structureHierarchy.getSelectorPath(paragraph1);
  // Use the second paragraph here so that both children of the section are verified
  SelectorPath path2 = structureHierarchy.getSelectorPath(paragraph2);

  assertEquals("Section:nth-of-type(1)", path1.toDepth(1).toString());
  assertEquals("Section:nth-of-type(1)", path2.toDepth(1).toString());
}
document.addToIndexes(); root = StructureHierarchy.build(jCas, StructureUtil.getStructureClasses()).getRoot();
paragraph2.addToIndexes(); ItemHierarchy<Structure> structureHierarchy = StructureHierarchy.build(jCas, structuralClasses); SelectorPath path1 = structureHierarchy.getSelectorPath(paragraph1); assertEquals("Section:nth-of-type(1) > Paragraph:nth-of-type(1)", path1.toString());