/**
 * Clears the given CAS and refills it with the next document from the collection reader.
 *
 * @param cas the CAS to recycle; any previous contents are discarded by {@code reset()}
 * @return the same CAS instance, now holding the next document
 * @throws CollectionException if the reader fails to produce the next document
 * @throws IOException on a low-level I/O failure while reading
 * @throws ResourceInitializationException if the underlying resource cannot be initialised
 */
public CAS getUsedCas(CAS cas) throws CollectionException, IOException, ResourceInitializationException {
    cas.reset();
    cr.getNext(cas);
    return cas;
}
/**
 * Converts a Hadoop key/value pair into a CAS: resets the target CAS and sets its
 * document text, optionally running the configured text extractor first.
 */
@Override
public void convertValue(Text keyFrom, Text valueFrom, CASWritable valueTo) {
    final CAS targetCas = valueTo.getCAS();
    targetCas.reset();
    // When an extractor is configured, derive the document text from the key/value
    // pair; otherwise the raw value is the document text.
    final Text documentText =
            (textExtractor == null) ? valueFrom : textExtractor.extractDocumentText(keyFrom, valueFrom);
    targetCas.setDocumentText(documentText.toString());
}
}
/**
 * Lazily builds (and caches) a map from document URI to the XMI file that contains it.
 * <p>
 * On first call, every XMI file in {@code dir} is deserialized into a scratch CAS so
 * its document metadata URI can be read; the CAS is reset between files for reuse.
 *
 * @return map of document URI to XMI file; never null
 * @throws IllegalStateException if two XMI files declare the same document URI
 */
private Map<String, File> getUriToXmiFileMap() throws ResourceInitializationException, IOException, SAXException {
    if (uriToXmiFileMap == null) {
        log.info("Scanning {} XMIs for document URIs...", dir);
        uriToXmiFileMap = Maps.newHashMap();
        // Single working CAS reused across all files to avoid per-file CAS creation cost.
        CAS wrkCas = createCas();
        for (final File xmiFile : getXmiFiles()) {
            deserialize(xmiFile, wrkCas);
            String docURI = docMetaExtractor.getDocumentUri(wrkCas);
            final File prevFile;
            // Map.put returns the previous value: a non-null result means another
            // file already claimed this URI — URIs must be unique per corpus.
            if ((prevFile = uriToXmiFileMap.put(docURI, xmiFile)) != null) {
                throw new IllegalStateException(
                        String.format(
                                "There are at least 2 files which metadata has the same URI '%s':\n%s\n%s",
                                docURI, prevFile, xmiFile));
            }
            // Clear annotations/sofas before loading the next file into the same CAS.
            wrkCas.reset();
        }
        log.info("Scanning {} XMIs for document URIs is finished", dir);
    }
    return uriToXmiFileMap;
}
public BehemothDocument[] process(BehemothDocument behemoth, Reporter reporter) { if (reporter != null) reporter.setStatus("UIMA : " + behemoth.getUrl().toString()); // generate a CAS from the input document cas.reset(); try { doProcess(behemoth, reporter); } catch (Exception e) { if (reporter != null) reporter.incrCounter("UIMA", "Exception", 1); LOG.error(behemoth.getUrl().toString(), e); } if (reporter != null) reporter.incrCounter("UIMA", "Document", 1); // return the modified document return new BehemothDocument[] { behemoth }; }
public void map(Text id, BehemothDocument behemoth, OutputCollector<Text, BehemothDocument> output, Reporter reporter) throws IOException { reporter.setStatus("UIMA : " + id.toString()); // generate a CAS from the input document cas.reset(); try { doProcess(behemoth, reporter); } catch (Exception e) { reporter.incrCounter("UIMA", "Exception", 1); throw new IOException(e); } reporter.incrCounter("UIMA", "Document", 1); // dump the modified document output.collect(id, behemoth); }
/**
 * Drives the reader/engine pipeline over every document in the collection, invoking
 * {@code callback} for each processed CAS.
 * <p>
 * A single CAS, created from the merged metadata of reader and engine, is reused
 * across documents (reset after each one). Completion/teardown of the engine and
 * reader is guaranteed via {@code finally}, even if processing fails mid-stream.
 */
void runPipeline(final CollectionReader reader, AnalysisEngine engine, CasProcessedCallback callback) throws UIMAException, IOException {
    final List<ResourceMetaData> mergedMetaData = new ArrayList<ResourceMetaData>();
    mergedMetaData.add(reader.getMetaData());
    mergedMetaData.add(engine.getMetaData());
    final CAS cas = CasCreationUtils.createCas(mergedMetaData);
    try {
        while (reader.hasNext()) {
            reader.getNext(cas);
            runPipeline(cas, engine, callback);
            cas.reset();
        }
    } finally {
        SimplePipelineRev803.collectionProcessComplete(engine);
        SimplePipelineRev803.destroy(reader);
    }
}
/**
 * Converts every {@code .xml} file in {@code inputDir} to XMI: each file is read
 * into a shared CAS, serialized to its corresponding output file, and the CAS is
 * reset for the next document. No-op when the input directory has no XML files.
 *
 * @throws ResourceInitializationException if the type system or CAS cannot be created
 * @throws IOException on read/write failure
 * @throws SAXException if XML parsing or XMI serialization fails
 */
private void run() throws ResourceInitializationException, IOException, SAXException {
    Collection<File> inputFiles = FileUtils.listFiles(inputDir,
            FileFilterUtils.suffixFileFilter(".xml"), null);
    if (inputFiles.isEmpty()) {
        return;
    }
    TypeSystemDescription tsd = TypeSystemDescriptionFactory
            .createTypeSystemDescription(typeSystemDescName);
    CAS cas = CasCreationUtils.createCas(tsd, null, null);
    for (File inputFile : inputFiles) {
        AXMLReader.read(inputFile, cas);
        File outFile = getOutputFile(inputFile);
        // FIX: try-with-resources instead of a manual out.close() in finally —
        // if serialize() throws AND close() also throws, the close() failure is
        // now attached as a suppressed exception instead of masking the original.
        try (OutputStream out = FileUtils.openOutputStream(outFile)) {
            XmiCasSerializer.serialize(cas, null, out, true, null);
        } finally {
            // Always clear the shared CAS so the next document starts clean.
            cas.reset();
        }
    }
}
/**
 * Runs the analysis engine over the given text, hands the populated CAS to
 * {@link #processCAS}, then clears the CAS for reuse.
 *
 * @param text document text to analyze; language is taken from {@code langID}
 */
public void runCPM(String text) {
    cas.setDocumentText(text);
    cas.setDocumentLanguage(langID);
    try {
        ae.process(cas);
    } catch (AnalysisEngineProcessException ex) {
        // NOTE(review): the failure is only printed and processCAS still runs on a
        // partially-processed CAS — confirm this best-effort behaviour is intended.
        ex.printStackTrace();
    }
    processCAS(cas);
    cas.reset();
}
public PosUimaTokenizer(String tokens, AnalysisEngine engine, Collection<String> allowedPosTags) { if (engine == null) PosUimaTokenizer.engine = engine; this.allowedPosTags = allowedPosTags; this.tokens = new ArrayList<>(); try { if (cas == null) cas = engine.newCAS(); cas.reset(); cas.setDocumentText(tokens); PosUimaTokenizer.engine.process(cas); for (Sentence s : JCasUtil.select(cas.getJCas(), Sentence.class)) { for (Token t : JCasUtil.selectCovered(Token.class, s)) { //add NONE for each invalid token if (valid(t)) if (t.getLemma() != null) this.tokens.add(t.getLemma()); else if (t.getStem() != null) this.tokens.add(t.getStem()); else this.tokens.add(t.getCoveredText()); else this.tokens.add("NONE"); } } } catch (Exception e) { throw new RuntimeException(e); } }
public PosUimaTokenizer(String tokens,AnalysisEngine engine,Collection<String> allowedPosTags) { if(engine == null) PosUimaTokenizer.engine = engine; this.allowedPosTags = allowedPosTags; this.tokens = new ArrayList<>(); try { if(cas == null) cas = engine.newCAS(); cas.reset(); cas.setDocumentText(tokens); PosUimaTokenizer.engine.process(cas); for(Sentence s : JCasUtil.select(cas.getJCas(), Sentence.class)) { for(Token t : JCasUtil.selectCovered(Token.class,s)) { //add NONE for each invalid token if(valid(t)) if(t.getLemma() != null) this.tokens.add(t.getLemma()); else if(t.getStem() != null) this.tokens.add(t.getStem()); else this.tokens.add(t.getCoveredText()); else this.tokens.add("NONE"); } } } catch (Exception e) { throw new RuntimeException(e); } }
public PosUimaTokenizer(String tokens,AnalysisEngine engine,Collection<String> allowedPosTags) { if(engine == null) PosUimaTokenizer.engine = engine; this.allowedPosTags = allowedPosTags; this.tokens = new ArrayList<>(); try { if(cas == null) cas = engine.newCAS(); cas.reset(); cas.setDocumentText(tokens); PosUimaTokenizer.engine.process(cas); for(Sentence s : JCasUtil.select(cas.getJCas(), Sentence.class)) { for(Token t : JCasUtil.selectCovered(Token.class,s)) { //add NONE for each invalid token if(valid(t)) if(t.getLemma() != null) this.tokens.add(t.getLemma()); else if(t.getStem() != null) this.tokens.add(t.getStem()); else this.tokens.add(t.getCoveredText()); else this.tokens.add("NONE"); } } } catch (Exception e) { throw new RuntimeException(e); } }
/**
 * Measures per-document read latency of a collection reader.
 * <p>
 * Each iteration times a single {@code getNext(cas)} call (in milliseconds), then
 * reconfigures the reader and resets the CAS so the next iteration starts fresh.
 *
 * @param aReaderDesc description of the reader under test
 * @param aJCas JCas whose underlying CAS is reused for every read
 * @param aIterations number of timed reads to perform
 * @return summary statistics (count/mean/min/max/stddev) of the read timings
 */
public static SummaryStatistics measureReadPerformance(CollectionReaderDescription aReaderDesc, JCas aJCas, int aIterations) throws ResourceInitializationException, CollectionException, IOException, ResourceConfigurationException {
    CollectionReader reader = createReader(aReaderDesc);
    SummaryStatistics stats = new SummaryStatistics();
    CAS cas = aJCas.getCas();
    // FIX: close/destroy in finally — the original leaked the reader when
    // getNext() or reconfigure() threw mid-loop.
    try {
        for (int i = 0; i < aIterations; i++) {
            long begin = System.currentTimeMillis();
            reader.getNext(cas);
            stats.addValue(System.currentTimeMillis() - begin);
            reader.reconfigure();
            cas.reset();
        }
    } finally {
        reader.close();
        reader.destroy();
    }
    return stats;
}
// Read the next document into the CAS, run all engines on it, then clear the
// CAS so the same instance can be reused for the following document.
reader.getNext(cas); runPipeline(cas, engines); cas.reset();
/**
 * Returns an empty CAS for the next request, reusing a single cached instance.
 * <p>
 * On first call the CAS is created from the analysis engine's own metadata
 * (type priorities, type system, FS indexes); on subsequent calls the cached
 * CAS is simply reset.
 * <p>
 * NOTE(review): the {@code cas != null} check itself is not synchronized, so this
 * method assumes it is never called from two threads concurrently — confirm the
 * caller serializes access.
 *
 * @return the shared, freshly-reset CAS
 * @throws ResourceInitializationException if CAS creation fails
 */
public CAS getEmptyCas() throws ResourceInitializationException {
    // Reuse same CAS for each request
    if (cas != null) {
        cas.reset();
    } else {
        TypePriorities ae_tp = ae.getProcessingResourceMetaData().getTypePriorities();
        TypeSystemDescription ae_tsd = ae.getProcessingResourceMetaData().getTypeSystem();
        FsIndexDescription[] ae_fid = ae.getProcessingResourceMetaData().getFsIndexes();
        // Use class level locking to serialize access to CasCreationUtils.
        // Only one thread at a time may create a CAS: UIMA uses a lazy
        // initialization approach which can cause NPE when two threads
        // attempt to initialize a CAS concurrently.
        synchronized (CasCreationUtils.class) {
            cas = CasCreationUtils.createCas(ae_tsd, ae_tp, ae_fid);
        }
    }
    return cas;
}
/**
 * Analyzes the tokenizer input with the configured analysis engine.
 * <p>
 * {@link #cas} is (re)populated with the extracted metadata (UIMA annotations,
 * feature structures). Both the engine and the CAS are created lazily on first
 * use and reused afterwards.
 *
 * @throws IOException If there is a low-level I/O error.
 */
protected void analyzeInput() throws ResourceInitializationException, AnalysisEngineProcessException, IOException {
    if (ae == null) {
        ae = AEProviderFactory.getInstance()
                .getAEProvider(null, descriptorPath, configurationParameters)
                .getAE();
    }
    if (cas != null) {
        cas.reset();
    } else {
        cas = ae.newCAS();
    }
    cas.setDocumentText(toString(input));
    ae.process(cas);
}
// Clear the CAS contents, then return it to its pool/owner for reuse.
cas.reset(); cas.release();
/**
 * Loads an XMI CAS file into the current CAS and refreshes the UI.
 * <p>
 * The CAS is reset before deserialization; on success the sofas are handled,
 * the title/index tree/toolbar state are updated, and the load time is shown
 * in the status bar. Any failure is routed to {@code handleException}.
 *
 * @param xmiCasFile the xmi cas file
 */
public void loadXmiFile(File xmiCasFile) {
    try {
        setXcasFileOpenDir(xmiCasFile.getParentFile());
        Timer time = new Timer();
        time.start();
        SAXParserFactory saxParserFactory = XMLUtils.createSAXParserFactory();
        SAXParser parser = saxParserFactory.newSAXParser();
        XmiCasDeserializer xmiCasDeserializer = new XmiCasDeserializer(getCas().getTypeSystem());
        getCas().reset();
        parser.parse(xmiCasFile, xmiCasDeserializer.getXmiCasHandler(getCas(), true));
        time.stop();
        handleSofas();
        setTitle("XMI CAS");
        updateIndexTree(true);
        setRunOnCasEnabled();
        setEnableCasFileReadingAndWriting();
        setStatusbarMessage("Done loading XMI CAS file in " + time.getTimeSpan() + ".");
    } catch (Exception e) {
        // FIX: removed the redundant e.printStackTrace() — handleException(e) is
        // the application's error-reporting path, and printing as well duplicated
        // every failure on stderr.
        handleException(e);
    }
}
/**
 * Verifies that {@code TextFileReader} yields one document per expected test file
 * and that each document's text matches the expected content keyed by URI suffix.
 */
@Test
public void test() throws Exception {
    CollectionReader reader = CollectionReaderFactory.createCollectionReader(
            TextFileReader.class, createTypeSystemDescription(),
            TextFileReader.PARAM_PATH, "src/test/resources/textfiles",
            TextFileReader.PARAM_LANGUAGE, "Latin");
    CAS cas = CasCreationUtils.createCas(reader.getProcessingResourceMetaData());
    int matched = 0;
    while (reader.hasNext()) {
        reader.getNext(cas);
        DocumentMetaData meta = iterate(cas.getJCas(), DocumentMetaData.class).iterator().next();
        for (Entry<String, String> expected : testFileContent.entrySet()) {
            if (meta.getDocumentUri().endsWith(expected.getKey())) {
                assertEquals(expected.getValue(), cas.getDocumentText());
                matched++;
            }
        }
        // Reset so the same CAS can hold the next document.
        cas.reset();
    }
    // Every expected file must have been seen exactly once.
    assertEquals(testFileContent.size(), matched);
}
}
// Fill the CAS with the next document, run the aggregate analysis engine on it,
// then reset the CAS for reuse on the next iteration.
reader.getNext(cas); aae.process(cas); cas.reset();