/**
 * Completes the destination view: stores the text accumulated in {@code sofaBuilder} as the
 * view's document text, then adds every entry of {@code completedAnnotations} to the view's
 * indexes.
 */
@Override
public void finishDestination() {
    final String sofaText = sofaBuilder.toString();
    destinationView.setDocumentText(sofaText);
    completedAnnotations.forEach(annotation -> destinationView.addFsToIndexes(annotation));
}
/**
 * Stores the text collected in {@code text} as the CAS document text and, when trace logging
 * is enabled, writes a closing document marker to the log.
 *
 * @param aPdf the document that has just been finished (not used here)
 * @throws IOException declared by the overridden method
 */
@Override
protected void endDocument(final PDDocument aPdf) throws IOException {
    final String collectedText = text.toString();
    cas.setDocumentText(collectedText);

    if (!log.isTraceEnabled()) {
        return;
    }
    log.trace("</document>");
}
/**
 * Initializes the CAS: sets the document language from {@code language} and the document text
 * from the current content of {@code textArea}.
 */
private final void initCas() {
    this.cas.setDocumentLanguage(this.language);

    final String currentText = this.textArea.getText();
    this.cas.setDocumentText(currentText);
}
/**
 * {@inheritDoc}
 * <p>
 * Uses the value returned by {@code consumeLine()} as the document text of the given CAS.
 */
@Override
public void getNext(CAS cas) throws IOException, CollectionException {
    cas.setDocumentText(consumeLine());
}
/**
 * Creates a new CAS view with the given name, sets its document text, and wraps the view in a
 * {@code CASDocument}.
 *
 * @param name the name of the view to create
 * @param text the document text to store in the new view
 * @return a document backed by the newly created view
 */
@Nonnull
@Override
public Document addDocument(@Nonnull String name, @Nonnull String text) {
    final CAS documentView = cas.createView(name);
    documentView.setDocumentText(text);

    final Document created = new CASDocument(documentView, labelAdapters);
    return created;
}
/**
 * Resets the CAS held by {@code valueTo} and sets its document text from the input record.
 * <p>
 * When a {@code textExtractor} is configured, the text it extracts from the key/value pair is
 * used; otherwise the string form of {@code valueFrom} is used.
 *
 * @param keyFrom   the input key
 * @param valueFrom the input value
 * @param valueTo   the writable whose CAS is reset and filled
 */
@Override
public void convertValue(Text keyFrom, Text valueFrom, CASWritable valueTo) {
    final CAS target = valueTo.getCAS();
    target.reset();

    final Text documentText = (textExtractor == null)
            ? valueFrom
            : textExtractor.extractDocumentText(keyFrom, valueFrom);
    target.setDocumentText(documentText.toString());
}
}
/**
 * Uses the analysis engine to process the given text.
 * <p>
 * The returned CAS comes from {@code retrieve()}; you must release it yourself.
 * <p>
 * If the engine fails on non-empty text, the same CAS is reset and processing is retried, up
 * to a fixed number of attempts. Null or empty text is never retried.
 *
 * @param text the text to process
 * @return the processed cas
 * @throws RuntimeException wrapping the last {@code AnalysisEngineProcessException} when
 *                          processing fails and no further attempt is made
 */
public CAS process(String text) {
    // Bounded on purpose: the previous unbounded recursive retry never terminated normally
    // when the engine failed deterministically, and pulled a fresh CAS on every attempt.
    final int maxAttempts = 3;
    final boolean retryable = text != null && !text.isEmpty();

    // NOTE(review): when this method throws, the retrieved CAS is not handed back to its
    // source; the release API is not visible from this method - confirm and release it there.
    CAS cas = retrieve();
    for (int attempt = 1;; attempt++) {
        cas.setDocumentText(text);
        try {
            analysisEngine.process(cas);
            return cas;
        } catch (AnalysisEngineProcessException e) {
            if (!retryable || attempt >= maxAttempts) {
                throw new RuntimeException(e);
            }
            // Clear the partially processed content so the document text can be set again.
            cas.reset();
        }
    }
}
/**
 * Uses the analysis engine to process the given text.
 * <p>
 * The returned CAS comes from {@code retrieve()}; you must release it yourself.
 * <p>
 * If the engine fails on non-empty text, the same CAS is reset and processing is retried, up
 * to a fixed number of attempts. Null or empty text is never retried.
 *
 * @param text the text to process
 * @return the processed cas
 * @throws RuntimeException wrapping the last {@code AnalysisEngineProcessException} when
 *                          processing fails and no further attempt is made
 */
public CAS process(String text) {
    // Bounded on purpose: the previous unbounded recursive retry never terminated normally
    // when the engine failed deterministically, and pulled a fresh CAS on every attempt.
    final int maxAttempts = 3;
    final boolean retryable = text != null && !text.isEmpty();

    // NOTE(review): when this method throws, the retrieved CAS is not handed back to its
    // source; the release API is not visible from this method - confirm and release it there.
    CAS cas = retrieve();
    for (int attempt = 1;; attempt++) {
        cas.setDocumentText(text);
        try {
            analysisEngine.process(cas);
            return cas;
        } catch (AnalysisEngineProcessException e) {
            if (!retryable || attempt >= maxAttempts) {
                throw new RuntimeException(e);
            }
            // Clear the partially processed content so the document text can be set again.
            cas.reset();
        }
    }
}
/**
 * Builds a CAS from the given text and language, serializes it in the requested format, and
 * returns the serialized bytes as a stream.
 * <p>
 * The text is passed through {@code removeNonXmlChars} before it is stored in the CAS.
 *
 * @param fileName name of the source file; used only in the error message
 * @param text     the document text
 * @param language the document language to set on the CAS
 * @param format   the serialization format handed to {@code CasIOUtils.save}
 * @return an in-memory stream over the serialized CAS
 * @throws TaeError if serialization fails with an {@code IOException}
 */
private InputStream getDocument(String fileName, String text, String language, SerialFormat format) {
    final CAS importCas = createEmtpyCAS();
    importCas.setDocumentText(removeNonXmlChars(text));
    importCas.setDocumentLanguage(language);

    final ByteArrayOutputStream buffer = new ByteArrayOutputStream(40000);
    try {
        CasIOUtils.save(importCas, buffer, format);
    } catch (IOException ioFailure) {
        final String message = "Failed to import: " + fileName + "\n\n" + ioFailure.getMessage();
        throw new TaeError(message, ioFailure);
    }

    final byte[] serialized = buffer.toByteArray();
    return new ByteArrayInputStream(serialized);
}
/**
 * Uses the analysis engine to process the given text.
 * <p>
 * The returned CAS comes from {@code retrieve()}; you must release it yourself.
 * <p>
 * If the engine fails on non-empty text, the same CAS is reset and processing is retried, up
 * to a fixed number of attempts. Null or empty text is never retried.
 *
 * @param text the text to process
 * @return the processed cas
 * @throws RuntimeException wrapping the last {@code AnalysisEngineProcessException} when
 *                          processing fails and no further attempt is made
 */
public CAS process(String text) {
    // Bounded on purpose: the previous unbounded recursive retry never terminated normally
    // when the engine failed deterministically, and pulled a fresh CAS on every attempt.
    final int maxAttempts = 3;
    final boolean retryable = text != null && !text.isEmpty();

    // NOTE(review): when this method throws, the retrieved CAS is not handed back to its
    // source; the release API is not visible from this method - confirm and release it there.
    CAS cas = retrieve();
    for (int attempt = 1;; attempt++) {
        cas.setDocumentText(text);
        try {
            analysisEngine.process(cas);
            return cas;
        } catch (AnalysisEngineProcessException e) {
            if (!retryable || attempt >= maxAttempts) {
                throw new RuntimeException(e);
            }
            // Clear the partially processed content so the document text can be set again.
            cas.reset();
        }
    }
}
@Override public void getNext(CAS cas) throws IOException { // Initialize CAS with document meta data initCas(cas, currentFileResource, null); if (!StringUtils.isWhitespace(language)) { cas.setDocumentLanguage(language); } // The buffer where document text is to be stored StringBuilder documentText = new StringBuilder(); Node node = nodes.poll(); if (node != null) { processNode(cas, node, documentText); } // Set document text in cas or error if nothing gets parsed out String documentTextString = documentText.toString(); if (StringUtils.isWhitespace(documentTextString)) { cas.setDocumentText("[Parse error]"); } else { cas.setDocumentText(documentTextString); } }
protected void doProcess(BehemothDocument behemoth, Reporter reporter) throws AnalysisEngineProcessException { // does the input document have a some text? // if not - skip it if (behemoth.getText() == null) { LOG.debug(behemoth.getUrl().toString() + " has null text"); } else { // detect language if specified by user String lang = this.config.get("uima.language", "en"); cas.setDocumentLanguage(lang); cas.setDocumentText(behemoth.getText()); // process it tae.process(cas); convertCASToBehemoth(cas, behemoth, reporter); } }
/**
 * {@inheritDoc}
 * <p>
 * Takes the next tuple from {@code dbIterator}, uses its text as the document text, and
 * indexes a {@code DocumentMetadata} annotation carrying the tuple's URL as source URI.
 *
 * @throws NoSuchElementException if the iterator has no further tuples
 * @throws CollectionException    if the JCas view of the CAS cannot be obtained
 */
@Override
public void getNext(CAS cas) throws IOException, CollectionException {
    final boolean tupleAvailable = dbIterator.hasNext();
    if (!tupleAvailable) {
        throw new NoSuchElementException();
    }

    final DbTuple current = dbIterator.next();
    consumedCount++;
    cas.setDocumentText(current.text);

    try {
        final DocumentMetadata metadata = new DocumentMetadata(cas.getJCas());
        metadata.setSourceUri(current.url);
        metadata.addToIndexes();
    } catch (CASException casFailure) {
        throw new CollectionException(casFailure);
    }
}
/** * {@inheritDoc} */ @Override public void getNext(CAS aCAS) throws IOException, CollectionException { if (!hasNext()) { throw new CollectionException(new NoSuchElementException()); } final int curFileIdx = lastReadFileIdx + 1; File file = files.get(curFileIdx); lastReadFileIdx = curFileIdx; // String fileContent = FileUtils.readFileToString(file, encoding); aCAS.setDocumentText(fileContent); try { DocumentMetadata docMeta = new DocumentMetadata(aCAS.getJCas()); docMeta.setSourceUri(getURIForMetadata(file).toString()); docMeta.addToIndexes(); } catch (CASException e) { throw new IllegalStateException(e); } }
/**
 * Reads the next file resource into the given CAS as document text.
 * <p>
 * The CAS is first initialized from the resource. The resource stream is opened through
 * {@code CompressionUtils.getInputStream} and buffered. When {@code sourceEncoding} equals
 * {@code ENCODING_AUTO}, the encoding is detected from the stream; otherwise the stream is
 * decoded with {@code sourceEncoding}.
 *
 * @param aJCas the CAS to populate
 * @throws IOException         if the resource cannot be read, or if automatic detection
 *                             yields no reader for the stream
 * @throws CollectionException declared by the overridden method
 */
@Override
public void getNext(CAS aJCas) throws IOException, CollectionException {
    Resource res = nextFile();
    initCas(aJCas, res);

    try (InputStream is = new BufferedInputStream(
            CompressionUtils.getInputStream(res.getLocation(), res.getInputStream()))) {
        final String text;
        if (ENCODING_AUTO.equals(sourceEncoding)) {
            CharsetDetector detector = new CharsetDetector();
            // NOTE(review): assumes this is ICU4J's CharsetDetector, whose getReader returns
            // null when no charset matches or reading fails - confirm. Without this check the
            // null reader surfaces as a NullPointerException further down.
            java.io.Reader detectedReader = detector.getReader(is, null);
            if (detectedReader == null) {
                throw new IOException(
                        "Unable to detect the character encoding of [" + res.getLocation() + "]");
            }
            text = IOUtils.toString(detectedReader);
        } else {
            text = IOUtils.toString(is, sourceEncoding);
        }
        aJCas.setDocumentText(text);
    }
}
}
/**
 * Runs the PlainTextAnnotator over a text file and prints every annotation's short type name
 * and covered text to standard output.
 * <p>
 * The descriptor {@code PlainTextAnnotator.xml} is looked up on the classpath, first at the
 * root and then under {@code org/apache/uima/textmarker/engine/}.
 *
 * @param args optional; {@code args[0]} is the path of the UTF-8 input file. When absent, the
 *             original hard-coded development path is used.
 * @throws Exception if the descriptor cannot be found or parsed, or if reading or processing
 *                   the input fails
 */
public static void main(String[] args) throws Exception {
    URL url = TextMarkerEngine.class.getClassLoader().getResource("PlainTextAnnotator.xml");
    if (url == null) {
        url = PlainTextAnnotator.class.getClassLoader().getResource(
                "org/apache/uima/textmarker/engine/PlainTextAnnotator.xml");
    }
    if (url == null) {
        // Fail with a clear message instead of handing a null URL to XMLInputSource.
        throw new IllegalStateException("PlainTextAnnotator.xml was not found on the classpath");
    }

    // Input file: first command-line argument if given, else the previous fixed location.
    String inputPath = (args != null && args.length > 0)
            ? args[0]
            : "D:/work/workspace-textmarker/Test/input/list1.txt";

    XMLInputSource in = new XMLInputSource(url);
    ResourceSpecifier specifier = UIMAFramework.getXMLParser().parseResourceSpecifier(in);
    AnalysisEngine ae = UIMAFramework.produceAnalysisEngine(specifier);

    CAS cas = ae.newCAS();
    cas.setDocumentText(FileUtils.file2String(new File(inputPath), "UTF-8"));
    ae.process(cas);

    AnnotationIndex<AnnotationFS> annotationIndex = cas.getAnnotationIndex();
    for (AnnotationFS annotationFS : annotationIndex) {
        System.out.println(annotationFS.getType().getShortName() + " : "
                + annotationFS.getCoveredText());
    }
}
/**
 * Processes the given text with the analysis engine using the shared CAS, hands the CAS to
 * {@code processCAS}, and resets the CAS afterwards.
 * <p>
 * A failure of the analysis engine is printed and {@code processCAS} is still invoked with
 * whatever content the CAS holds at that point.
 *
 * @param text the document text to analyse
 */
public void runCPM(String text) {
    try {
        cas.setDocumentText(text);
        cas.setDocumentLanguage(langID);
        try {
            ae.process(cas);
        } catch (AnalysisEngineProcessException e) {
            // Best effort: report the failure and continue with the CAS as it is.
            e.printStackTrace();
        }
        processCAS(cas);
    } finally {
        // Always clear the shared CAS, even when a step above throws; otherwise the next
        // call would start from a CAS that still holds this document.
        cas.reset();
    }
}
public void process(CAS aCAS) throws AnalysisEngineProcessException { // get handle to CAS view containing XML document CAS xmlCas = aCAS.getView("xmlDocument"); InputStream xmlStream = xmlCas.getSofa().getSofaDataStream(); // parse with detag handler DetagHandler handler = new DetagHandler(); try { SAXParser parser = parserFactory.newSAXParser(); parser.parse(xmlStream, handler); } catch (Exception e) { throw new AnalysisEngineProcessException(e); } // create the plain text view and set its document text CAS plainTextView = aCAS.createView("plainTextDocument"); plainTextView.setDocumentText(handler.getDetaggedText()); plainTextView.setDocumentLanguage(aCAS.getView("_InitialView").getDocumentLanguage()); // Index the SourceDocumentInformation object, if there is one, in the new sofa. // This is needed by the SemanticSearchCasIndexer Iterator iter = xmlCas.getAnnotationIndex(sourceDocInfoType).iterator(); if (iter.hasNext()) { FeatureStructure sourceDocInfoFs = (FeatureStructure) iter.next(); plainTextView.getIndexRepository().addFS(sourceDocInfoFs); } }
/**
 * Analyzes the tokenizer input with the analysis engine.
 * <p>
 * The engine is obtained from the {@code AEProviderFactory} on first use. The {@link #cas} is
 * created from the engine on first use and reset on later calls; after processing it holds
 * the extracted metadata (UIMA annotations, feature structures).
 *
 * @throws ResourceInitializationException declared for obtaining the analysis engine or CAS
 * @throws AnalysisEngineProcessException  if the analysis engine fails while processing
 * @throws IOException                     If there is a low-level I/O error.
 */
protected void analyzeInput() throws ResourceInitializationException, AnalysisEngineProcessException, IOException {
    if (ae == null) {
        ae = AEProviderFactory.getInstance()
                .getAEProvider(null, descriptorPath, configurationParameters)
                .getAE();
    }

    // Reuse the existing CAS when there is one; otherwise create it from the engine.
    if (cas != null) {
        cas.reset();
    } else {
        cas = ae.newCAS();
    }

    final String inputText = toString(input);
    cas.setDocumentText(inputText);
    ae.process(cas);
}
/**
 * Creates an artifact backed by the given CAS.
 * <p>
 * Looks up the {@code ArtifactMetadata} type with its {@code key} and {@code value} features,
 * creates a {@code metadata} view with empty document text, stores the artifact id in that
 * view as an indexed {@code ArtifactID} feature structure, and obtains the {@code metadata}
 * index of the view.
 *
 * @param labelAdapters the label adapters to keep for this artifact; may be null
 * @param cas           the CAS that backs this artifact
 * @param artifactID    the identifier stored in the metadata view
 */
CASArtifact(@Nullable LabelAdapters labelAdapters, CAS cas, String artifactID) {
    // Plain constructor-argument assignments first.
    this.labelAdapters = labelAdapters;
    this.cas = cas;
    this.artifactID = artifactID;

    final TypeSystem types = cas.getTypeSystem();
    metadataType = types.getType("ArtifactMetadata");
    keyFeature = metadataType.getFeatureByBaseName("key");
    valueFeature = metadataType.getFeatureByBaseName("value");

    metadataCas = cas.createView("metadata");
    metadataCas.setDocumentText("");

    // Record the artifact id as an indexed feature structure in the metadata view.
    final Type artifactIdType = types.getType("ArtifactID");
    final Feature artifactIdFeature = artifactIdType.getFeatureByBaseName("artifactID");
    final FeatureStructure artifactIdFs = metadataCas.createFS(artifactIdType);
    artifactIdFs.setStringValue(artifactIdFeature, artifactID);
    metadataCas.addFsToIndexes(artifactIdFs);

    metadataIndex = metadataCas.getIndexRepository().getIndex("metadata", metadataType);
    casMetadata = new CASMetadata();
}