/**
 * Checks that running a {@link TearlineContentExtractor} over {@code tearline/1.docx} leaves at
 * least one {@link Metadata} annotation in the JCas.
 *
 * @throws Exception if the extractor cannot be initialised or the document cannot be processed
 */
@Test
public void testMetadata() throws Exception {
  JCas jCas = JCasSingleton.getJCasInstance();

  BaleenContentExtractor contentExtractor = new TearlineContentExtractor();
  contentExtractor.initialize(new CustomResourceSpecifier_impl(), Collections.emptyMap());

  // Resolve through toURI() rather than URL.getPath(): getPath() keeps percent-encoding
  // (e.g. %20), so a checkout directory containing spaces would yield a non-existent file.
  File f = new File(getClass().getResource("tearline/1.docx").toURI());

  try (InputStream is = new FileInputStream(f)) {
    contentExtractor.processStream(is, f.getPath(), jCas);
    assertFalse(JCasUtil.select(jCas, Metadata.class).isEmpty());
  } finally {
    // Release the extractor even when processing or the assertion fails.
    contentExtractor.destroy();
  }
}
/**
 * Checks that a {@link TikaContentExtractor} turns {@code test.txt} into the document text
 * {@code "Hello World\n"} and adds exactly four {@link Metadata} annotations.
 *
 * @throws Exception if the extractor cannot be initialised or the document cannot be processed
 */
@Test
public void testTikaText() throws Exception {
  JCas jCas = JCasSingleton.getJCasInstance();
  BaleenContentExtractor contentExtractor = new TikaContentExtractor();

  // toURI() decodes percent-escapes that URL.getPath() would leave in place, so the lookup
  // also works when the resource directory contains spaces or non-ASCII characters.
  File f = new File(getClass().getResource("test.txt").toURI());

  contentExtractor.initialize(new CustomResourceSpecifier_impl(), Collections.emptyMap());
  try (InputStream is = new FileInputStream(f)) {
    contentExtractor.processStream(is, f.getPath(), jCas);
  } finally {
    // Always release the extractor, including when processStream throws.
    contentExtractor.destroy();
  }

  // Assertions run after destroy(), as in the original ordering.
  assertEquals("Hello World\n", jCas.getDocumentText());
  assertEquals(4, JCasUtil.select(jCas, Metadata.class).size());
}
/**
 * Checks that processing {@code corrupt.docx} with a {@link TikaContentExtractor} sets the
 * document text to {@link TikaContentExtractor#CORRUPT_FILE_TEXT}.
 *
 * @throws Exception if the extractor cannot be initialised or the stream cannot be opened
 */
@Test
public void testTikaCorruptFile() throws Exception {
  JCas jCas = JCasSingleton.getJCasInstance();
  BaleenContentExtractor contentExtractor = new TikaContentExtractor();

  // toURI() decodes percent-escapes that URL.getPath() would leave in place, so the lookup
  // also works when the resource directory contains spaces or non-ASCII characters.
  File f = new File(getClass().getResource("corrupt.docx").toURI());

  contentExtractor.initialize(new CustomResourceSpecifier_impl(), Collections.emptyMap());
  try (InputStream is = new FileInputStream(f)) {
    contentExtractor.processStream(is, f.getPath(), jCas);
  } finally {
    // Always release the extractor, including when processStream throws.
    contentExtractor.destroy();
  }

  assertEquals(TikaContentExtractor.CORRUPT_FILE_TEXT, jCas.getDocumentText());
}
}
@Test public void testNotEnoughCols() throws Exception { JCas jCas = JCasSingleton.getJCasInstance(); BaleenContentExtractor contentExtractor = new CsvContentExtractor(); File f = new File(getClass().getResource("test.csv").getPath()); Map<String, Object> config = new HashMap<>(); config.put(CsvContentExtractor.PARAM_SEPARATOR, ","); config.put(CsvContentExtractor.PARAM_CONTENT_COLUMN, "20"); config.put(CsvContentExtractor.PARAM_COLUMNS, Arrays.asList("id", "test1", "", "test3")); contentExtractor.initialize(new CustomResourceSpecifier_impl(), config); try (InputStream is = new FileInputStream(f); ) { contentExtractor.processStream(is, f.getPath(), jCas); fail("Expected error not thrown"); } catch (IOException ioe) { // This error is expected } contentExtractor.destroy(); } }
/**
 * Checks the text a {@link TikaContentExtractor} produces for {@code wrappingLines.docx}: the
 * expected output has one line per paragraph, with the long sentence kept on a single line.
 *
 * @throws Exception if the extractor cannot be initialised or the document cannot be processed
 */
@Test
public void testTikaWrappingDocx() throws Exception {
  JCas jCas = JCasSingleton.getJCasInstance();
  BaleenContentExtractor contentExtractor = new TikaContentExtractor();

  // toURI() decodes percent-escapes that URL.getPath() would leave in place, so the lookup
  // also works when the resource directory contains spaces or non-ASCII characters.
  File f = new File(getClass().getResource("wrappingLines.docx").toURI());

  contentExtractor.initialize(new CustomResourceSpecifier_impl(), Collections.emptyMap());
  try (InputStream is = new FileInputStream(f)) {
    contentExtractor.processStream(is, f.getPath(), jCas);
  } finally {
    // Always release the extractor, including when processStream throws.
    contentExtractor.destroy();
  }

  // Compile-time concatenation; the resulting constant is identical to the single literal.
  String expected =
      "Test Document\n"
          + "This is my test document, which has a sentence that is long enough to wrap over two lines but we want it to appear as a single line when we extract the content.\n"
          + "This is a second paragraph. This is a third sentence, but still the second paragraph. Super-cali-fragi-listic-expi-alo-docious.\n";
  assertEquals(expected, jCas.getDocumentText());
}
/**
 * Checks that a {@link TearlineContentExtractor} with default configuration extracts
 * {@code "This document has no tearline."} from {@code tearline/notearline.docx}.
 *
 * @throws Exception if the extractor cannot be initialised or the document cannot be processed
 */
@Test
public void testNoTearline() throws Exception {
  JCas jCas = JCasSingleton.getJCasInstance();

  BaleenContentExtractor contentExtractor = new TearlineContentExtractor();
  contentExtractor.initialize(new CustomResourceSpecifier_impl(), Collections.emptyMap());

  // toURI() decodes percent-escapes that URL.getPath() would leave in place, so the lookup
  // also works when the resource directory contains spaces or non-ASCII characters.
  File f = new File(getClass().getResource("tearline/notearline.docx").toURI());

  try (InputStream is = new FileInputStream(f)) {
    contentExtractor.processStream(is, f.getPath(), jCas);

    assertEquals("This document has no tearline.", jCas.getDocumentText());

    jCas.reset();
  } finally {
    // Release the extractor even when processing or the assertion fails.
    contentExtractor.destroy();
  }
}
/**
 * Checks that a {@link TearlineContentExtractor} initialised with the {@code "tearline"}
 * parameter set to {@code "Customer Form:"} extracts {@code "This is the first tearline."} from
 * {@code tearline/customtearline.docx}.
 *
 * @throws Exception if the extractor cannot be initialised or the document cannot be processed
 */
@Test
public void testCustomTearline() throws Exception {
  JCas jCas = JCasSingleton.getJCasInstance();

  Map<String, Object> params = new HashMap<>();
  params.put("tearline", "Customer Form:");

  BaleenContentExtractor contentExtractor = new TearlineContentExtractor();
  contentExtractor.initialize(new CustomResourceSpecifier_impl(), params);

  // toURI() decodes percent-escapes that URL.getPath() would leave in place, so the lookup
  // also works when the resource directory contains spaces or non-ASCII characters.
  File f = new File(getClass().getResource("tearline/customtearline.docx").toURI());

  try (InputStream is = new FileInputStream(f)) {
    contentExtractor.processStream(is, f.getPath(), jCas);

    assertEquals("This is the first tearline.", jCas.getDocumentText());

    jCas.reset();
  } finally {
    // Release the extractor even when processing or the assertion fails.
    contentExtractor.destroy();
  }
}
}
/**
 * Checks the {@code "boilerplate"} parameter of {@link TearlineContentExtractor}: with the single
 * entry {@code "[aeiou]"}, the text expected from {@code tearline/notearline.docx} is
 * {@code "Ths dcmnt hs n trln."}, i.e. the document sentence with its lowercase vowels removed.
 *
 * @throws Exception if the extractor cannot be initialised or the document cannot be processed
 */
@Test
public void testBoilerplate() throws Exception {
  JCas jCas = JCasSingleton.getJCasInstance();

  Map<String, Object> params = new HashMap<>();
  // NOTE(review): presumably each entry is applied as a regular expression - confirm against
  // TearlineContentExtractor.
  params.put("boilerplate", new String[] {"[aeiou]"});

  BaleenContentExtractor contentExtractor = new TearlineContentExtractor();
  contentExtractor.initialize(new CustomResourceSpecifier_impl(), params);

  // toURI() decodes percent-escapes that URL.getPath() would leave in place, so the lookup
  // also works when the resource directory contains spaces or non-ASCII characters.
  File f = new File(getClass().getResource("tearline/notearline.docx").toURI());

  try (InputStream is = new FileInputStream(f)) {
    contentExtractor.processStream(is, f.getPath(), jCas);

    assertEquals("Ths dcmnt hs n trln.", jCas.getDocumentText());

    jCas.reset();
  } finally {
    // Release the extractor even when processing or the assertion fails.
    contentExtractor.destroy();
  }
}
/**
 * Runs one {@link TearlineContentExtractor} over six fixture documents under {@code tearline/}
 * (docx, doc and pdf) and checks that each yields {@code "This is the first tearline."}.
 *
 * @throws Exception if the extractor cannot be initialised or a document cannot be processed
 */
@Test
public void testTearline() throws Exception {
  JCas jCas = JCasSingleton.getJCasInstance();

  BaleenContentExtractor contentExtractor = new TearlineContentExtractor();
  contentExtractor.initialize(new CustomResourceSpecifier_impl(), Collections.emptyMap());

  String[] files = new String[] {"1.docx", "2.docx", "3.docx", "4.docx", "5.doc", "6.pdf"};

  try {
    for (String file : files) {
      // toURI() decodes percent-escapes that URL.getPath() would leave in place, so the lookup
      // also works when the resource directory contains spaces or non-ASCII characters.
      File f = new File(getClass().getResource("tearline/" + file).toURI());

      try (InputStream is = new FileInputStream(f)) {
        contentExtractor.processStream(is, f.getPath(), jCas);

        assertEquals("This is the first tearline.", jCas.getDocumentText());

        // The same JCas is reused for the next file, so clear it between iterations.
        jCas.reset();
      }
    }
  } finally {
    // Release the extractor even when one of the files fails part-way through the loop.
    contentExtractor.destroy();
  }
}
/**
 * Checks the text and metadata a {@link TikaContentExtractor} produces for {@code test.docx}: the
 * two-line document text, 44 {@link Metadata} annotations in total, and specific values for the
 * {@code Page-Count} and {@code meta:author} keys.
 *
 * @throws Exception if the extractor cannot be initialised or the document cannot be processed
 */
@Test
public void testTikaWord() throws Exception {
  JCas jCas = JCasSingleton.getJCasInstance();
  BaleenContentExtractor contentExtractor = new TikaContentExtractor();

  // toURI() decodes percent-escapes that URL.getPath() would leave in place, so the lookup
  // also works when the resource directory contains spaces or non-ASCII characters.
  File f = new File(getClass().getResource("test.docx").toURI());

  contentExtractor.initialize(new CustomResourceSpecifier_impl(), Collections.emptyMap());
  try (InputStream is = new FileInputStream(f)) {
    contentExtractor.processStream(is, f.getPath(), jCas);
  } finally {
    // Always release the extractor, including when processStream throws.
    contentExtractor.destroy();
  }

  assertEquals(
      "Test Document\nThis is a simple test document, with a title and a single sentence.\n",
      jCas.getDocumentText());

  Collection<Metadata> metadata = JCasUtil.select(jCas, Metadata.class);
  assertEquals(44, metadata.size());

  // Index the annotations by key; if a key occurs more than once the last value wins.
  Map<String, String> metadataMap = new HashMap<>();
  for (Metadata md : metadata) {
    metadataMap.put(md.getKey(), md.getValue());
  }

  assertTrue(metadataMap.containsKey("Page-Count"));
  assertEquals("1", metadataMap.get("Page-Count"));

  assertTrue(metadataMap.containsKey("meta:author"));
  assertEquals("James Baker", metadataMap.get("meta:author"));
}
contentExtractor.processStream(is, f.getPath(), jCas); contentExtractor.destroy();