uk.gov.dstl.baleen.uima.BaleenContentExtractor.destroy Java code examples

/**
 * Runs a {@link TearlineContentExtractor} over {@code tearline/1.docx} and checks that at least
 * one {@link Metadata} annotation is present in the JCas afterwards.
 *
 * @throws Exception if initialisation, reading the file or processing the stream fails
 */
@Test
public void testMetadata() throws Exception {
 JCas jCas = JCasSingleton.getJCasInstance();
 BaleenContentExtractor contentExtractor = new TearlineContentExtractor();
 contentExtractor.initialize(new CustomResourceSpecifier_impl(), Collections.emptyMap());
 try {
  File f = new File(getClass().getResource("tearline/1.docx").getPath());
  try (InputStream is = new FileInputStream(f)) {
   contentExtractor.processStream(is, f.getPath(), jCas);
   assertFalse(JCasUtil.select(jCas, Metadata.class).isEmpty());
  }
 } finally {
  // Always release the extractor, even when processing or the assertion fails.
  contentExtractor.destroy();
 }
}

/**
 * Runs a {@link TikaContentExtractor} over {@code test.txt} and checks both the extracted
 * document text and the number of {@link Metadata} annotations added to the JCas.
 *
 * @throws Exception if initialisation, reading the file or processing the stream fails
 */
@Test
public void testTikaText() throws Exception {
 JCas jCas = JCasSingleton.getJCasInstance();
 BaleenContentExtractor contentExtractor = new TikaContentExtractor();
 File f = new File(getClass().getResource("test.txt").getPath());
 contentExtractor.initialize(new CustomResourceSpecifier_impl(), Collections.emptyMap());
 try (InputStream is = new FileInputStream(f)) {
  contentExtractor.processStream(is, f.getPath(), jCas);
 } finally {
  // Always release the extractor, even when processing throws.
  contentExtractor.destroy();
 }
 assertEquals("Hello World\n", jCas.getDocumentText());
 assertEquals(4, JCasUtil.select(jCas, Metadata.class).size());
}

 /**
  * Runs a {@link TikaContentExtractor} over {@code corrupt.docx} and checks that the document
  * text is set to {@link TikaContentExtractor#CORRUPT_FILE_TEXT}.
  *
  * @throws Exception if initialisation, reading the file or processing the stream fails
  */
 @Test
 public void testTikaCorruptFile() throws Exception {
  JCas jCas = JCasSingleton.getJCasInstance();

  BaleenContentExtractor contentExtractor = new TikaContentExtractor();

  File f = new File(getClass().getResource("corrupt.docx").getPath());

  contentExtractor.initialize(new CustomResourceSpecifier_impl(), Collections.emptyMap());
  try (InputStream is = new FileInputStream(f)) {
   contentExtractor.processStream(is, f.getPath(), jCas);
  } finally {
   // Always release the extractor, even when processing throws.
   contentExtractor.destroy();
  }

  assertEquals(TikaContentExtractor.CORRUPT_FILE_TEXT, jCas.getDocumentText());
 }
}

 /**
  * Configures a {@link CsvContentExtractor} with a content column of {@code "20"} while naming
  * only four columns, and expects processing {@code test.csv} to raise an {@link IOException}.
  *
  * <p>The test fails if no exception is thrown.
  *
  * @throws Exception if initialisation fails or an unexpected exception type is raised
  */
 @Test
 public void testNotEnoughCols() throws Exception {
  JCas jCas = JCasSingleton.getJCasInstance();

  BaleenContentExtractor contentExtractor = new CsvContentExtractor();

  File f = new File(getClass().getResource("test.csv").getPath());

  Map<String, Object> config = new HashMap<>();
  config.put(CsvContentExtractor.PARAM_SEPARATOR, ",");
  config.put(CsvContentExtractor.PARAM_CONTENT_COLUMN, "20");
  config.put(CsvContentExtractor.PARAM_COLUMNS, Arrays.asList("id", "test1", "", "test3"));

  contentExtractor.initialize(new CustomResourceSpecifier_impl(), config);
  try (InputStream is = new FileInputStream(f)) {
   contentExtractor.processStream(is, f.getPath(), jCas);
   fail("Expected error not thrown");
  } catch (IOException ioe) {
   // This error is expected
  } finally {
   // fail() raises an AssertionError, which the catch above does not handle;
   // the finally block ensures the extractor is still released in that case.
   contentExtractor.destroy();
  }
 }
}

/**
 * Runs a {@link TikaContentExtractor} over {@code wrappingLines.docx} and checks that the
 * extracted text matches the expected three newline-terminated lines exactly.
 *
 * @throws Exception if initialisation, reading the file or processing the stream fails
 */
@Test
public void testTikaWrappingDocx() throws Exception {
 JCas jCas = JCasSingleton.getJCasInstance();
 BaleenContentExtractor contentExtractor = new TikaContentExtractor();
 File f = new File(getClass().getResource("wrappingLines.docx").getPath());
 contentExtractor.initialize(new CustomResourceSpecifier_impl(), Collections.emptyMap());
 try (InputStream is = new FileInputStream(f)) {
  contentExtractor.processStream(is, f.getPath(), jCas);
 } finally {
  // Always release the extractor, even when processing throws.
  contentExtractor.destroy();
 }
 assertEquals(
   "Test Document\nThis is my test document, which has a sentence that is long enough to wrap over two lines but we want it to appear as a single line when we extract the content.\nThis is a second paragraph. This is a third sentence, but still the second paragraph. Super-cali-fragi-listic-expi-alo-docious.\n",
   jCas.getDocumentText());
}

/**
 * Runs a {@link TearlineContentExtractor} with default parameters over
 * {@code tearline/notearline.docx} and checks the extracted document text.
 *
 * @throws Exception if initialisation, reading the file or processing the stream fails
 */
@Test
public void testNoTearline() throws Exception {
 JCas jCas = JCasSingleton.getJCasInstance();
 BaleenContentExtractor contentExtractor = new TearlineContentExtractor();
 contentExtractor.initialize(new CustomResourceSpecifier_impl(), Collections.emptyMap());
 try {
  File f = new File(getClass().getResource("tearline/notearline.docx").getPath());
  try (InputStream is = new FileInputStream(f)) {
   contentExtractor.processStream(is, f.getPath(), jCas);
   assertEquals("This document has no tearline.", jCas.getDocumentText());
   jCas.reset();
  }
 } finally {
  // Always release the extractor, even when processing or the assertion fails.
  contentExtractor.destroy();
 }
}

 /**
  * Runs a {@link TearlineContentExtractor} configured with the {@code tearline} parameter set to
  * {@code "Customer Form:"} over {@code tearline/customtearline.docx} and checks the extracted
  * document text.
  *
  * @throws Exception if initialisation, reading the file or processing the stream fails
  */
 @Test
 public void testCustomTearline() throws Exception {
  JCas jCas = JCasSingleton.getJCasInstance();

  Map<String, Object> params = new HashMap<>();
  params.put("tearline", "Customer Form:");

  BaleenContentExtractor contentExtractor = new TearlineContentExtractor();
  contentExtractor.initialize(new CustomResourceSpecifier_impl(), params);

  try {
   File f = new File(getClass().getResource("tearline/customtearline.docx").getPath());

   try (InputStream is = new FileInputStream(f)) {
    contentExtractor.processStream(is, f.getPath(), jCas);
    assertEquals("This is the first tearline.", jCas.getDocumentText());

    jCas.reset();
   }
  } finally {
   // Always release the extractor, even when processing or the assertion fails.
   contentExtractor.destroy();
  }
 }
}

/**
 * Runs a {@link TearlineContentExtractor} configured with the {@code boilerplate} parameter set
 * to the single pattern {@code "[aeiou]"} over {@code tearline/notearline.docx} and checks the
 * extracted document text.
 *
 * @throws Exception if initialisation, reading the file or processing the stream fails
 */
@Test
public void testBoilerplate() throws Exception {
 JCas jCas = JCasSingleton.getJCasInstance();
 Map<String, Object> params = new HashMap<>();
 params.put("boilerplate", new String[] {"[aeiou]"});
 BaleenContentExtractor contentExtractor = new TearlineContentExtractor();
 contentExtractor.initialize(new CustomResourceSpecifier_impl(), params);
 try {
  File f = new File(getClass().getResource("tearline/notearline.docx").getPath());
  try (InputStream is = new FileInputStream(f)) {
   contentExtractor.processStream(is, f.getPath(), jCas);
   assertEquals("Ths dcmnt hs n trln.", jCas.getDocumentText());
   jCas.reset();
  }
 } finally {
  // Always release the extractor, even when processing or the assertion fails.
  contentExtractor.destroy();
 }
}

/**
 * Runs one {@link TearlineContentExtractor} over each of the sample documents under
 * {@code tearline/} (docx, doc and pdf) and checks that every one yields the same extracted
 * text. The JCas is reset after each file so it can be reused for the next one.
 *
 * @throws Exception if initialisation, reading a file or processing a stream fails
 */
@Test
public void testTearline() throws Exception {
 JCas jCas = JCasSingleton.getJCasInstance();
 BaleenContentExtractor contentExtractor = new TearlineContentExtractor();
 contentExtractor.initialize(new CustomResourceSpecifier_impl(), Collections.emptyMap());
 String[] files = new String[] {"1.docx", "2.docx", "3.docx", "4.docx", "5.doc", "6.pdf"};
 try {
  for (String file : files) {
   File f = new File(getClass().getResource("tearline/" + file).getPath());
   try (InputStream is = new FileInputStream(f)) {
    contentExtractor.processStream(is, f.getPath(), jCas);
    assertEquals("This is the first tearline.", jCas.getDocumentText());
    jCas.reset();
   }
  }
 } finally {
  // Always release the extractor, even when a file fails part-way through the loop.
  contentExtractor.destroy();
 }
}

/**
 * Runs a {@link TikaContentExtractor} over {@code test.docx} and checks the extracted text, the
 * number of {@link Metadata} annotations, and the values of the {@code Page-Count} and
 * {@code meta:author} metadata entries.
 *
 * @throws Exception if initialisation, reading the file or processing the stream fails
 */
@Test
public void testTikaWord() throws Exception {
 JCas jCas = JCasSingleton.getJCasInstance();
 BaleenContentExtractor contentExtractor = new TikaContentExtractor();
 File f = new File(getClass().getResource("test.docx").getPath());
 contentExtractor.initialize(new CustomResourceSpecifier_impl(), Collections.emptyMap());
 try (InputStream is = new FileInputStream(f)) {
  contentExtractor.processStream(is, f.getPath(), jCas);
 } finally {
  // Always release the extractor, even when processing throws.
  contentExtractor.destroy();
 }
 assertEquals(
   "Test Document\nThis is a simple test document, with a title and a single sentence.\n",
   jCas.getDocumentText());
 Collection<Metadata> metadata = JCasUtil.select(jCas, Metadata.class);
 assertEquals(44, metadata.size());
 // Index the annotations by key so individual entries can be checked by name.
 Map<String, String> metadataMap = new HashMap<>();
 for (Metadata md : metadata) {
  metadataMap.put(md.getKey(), md.getValue());
 }
 assertTrue(metadataMap.containsKey("Page-Count"));
 assertEquals("1", metadataMap.get("Page-Count"));
 assertTrue(metadataMap.containsKey("meta:author"));
 assertEquals("James Baker", metadataMap.get("meta:author"));
}

 // Usage excerpt (is, f and jCas are declared outside this fragment):
 // hand the input stream, its source path and the target JCas to the extractor.
 contentExtractor.processStream(is, f.getPath(), jCas);
// Destroy the extractor once it is no longer needed.
contentExtractor.destroy();

Popular methods of BaleenContentExtractor

addToJCasIndex
Add an annotation to the JCas index, notifying UimaMonitor of the fact we have done so
createMonitor
createSupport
doDestroy
Called when the content extractor has finished and is closing down. Any open resources, for example,
doInitialize
Called when the content extractor is being initialized. Any required resources, for example, should
doProcessStream
Called when the content extractor is being asked to process an inputstream and extract the content.
getSupport
Gets the UimaSupport object associated with this ContentExtractor, for instance to log errors.
getUimaContext
initialize
processStream

Popular in Java

Reactive rest calls using spring rest template
compareTo (BigDecimal)
scheduleAtFixedRate (ScheduledExecutorService)
addToBackStack (FragmentTransaction)
BufferedReader (java.io)
Wraps an existing Reader and buffers the input. Expensive interaction with the underlying reader is
File (java.io)
An "abstract" representation of a file system entity identified by a pathname. The pathname may be a
URLEncoder (java.net)
This class is used to encode a string using the format required by application/x-www-form-urlencoded
TimerTask (java.util)
The TimerTask class represents a task to run at a specified time. The task may be run once or repeat
ThreadPoolExecutor (java.util.concurrent)
An ExecutorService that executes each submitted task using one of possibly several pooled threads, n
Color (java.awt)
The Color class is used to encapsulate colors in the default sRGB color space or colors in arbitrary
Top PhpStorm plugins

How to use the destroy method in uk.gov.dstl.baleen.uima.BaleenContentExtractor

Best Java code snippets using uk.gov.dstl.baleen.uima.BaleenContentExtractor.destroy (Showing top 11 results out of 315)

How to use
destroy
method
in
uk.gov.dstl.baleen.uima.BaleenContentExtractor