Refine search
/** * Command line extractor, so people will stop moaning that they can't just * run this. */ public static void main( String[] args ) throws IOException { if ( args.length == 0 ) { System.err.println( "Use:" ); System.err .println( " java org.apache.poi.hwpf.extractor.WordExtractor <filename>" ); System.exit( 1 ); } // Process the first argument as a file FileInputStream fin = new FileInputStream( args[0] ); WordExtractor extractor = new WordExtractor( fin ); System.out.println( extractor.getText() ); }
FileInputStream fis = new FileInputStream(file.getAbsolutePath()); HWPFDocument document = new HWPFDocument(fis); extractor = new WordExtractor(document); String[] fileData = extractor.getParagraphText(); for (int i = 0; i < fileData.length; i++)
/** * Command line extractor, so people will stop moaning that they can't just * run this. */ public static void main( String[] args ) throws IOException { if ( args.length == 0 ) { System.err.println( "Use:" ); System.err .println( " java org.apache.poi.hwpf.extractor.WordExtractor <filename>" ); System.exit( 1 ); } // Process the first argument as a file InputStream fin = new FileInputStream( args[0] ); WordExtractor extractor = new WordExtractor( fin ); try { System.out.println( extractor.getText() ); } finally { extractor.close(); } }
HWPFDocument document; try { document = new HWPFDocument(root); } catch (org.apache.poi.EncryptedDocumentException e) { throw new EncryptedDocumentException(e); new org.apache.poi.hwpf.extractor.WordExtractor(document); PicturesTable pictureTable = document.getPicturesTable(); PicturesSource pictures = new PicturesSource(document); HeaderStories headerFooter = null; Range r = document.getRange(); ListManager listManager = new ListManager(document); for (int i = 0; i < r.numParagraphs(); i++) { for (String paragraph : wordExtractor.getMainTextboxText()) { xhtml.element("p", paragraph); for (String paragraph : wordExtractor.getFootnoteText()) { xhtml.element("p", paragraph); for (String paragraph : wordExtractor.getCommentsText()) { xhtml.element("p", paragraph); for (String paragraph : wordExtractor.getEndnoteText()) { xhtml.element("p", paragraph);
/** * {@inheritDoc} */ @Override protected String extractText(POIFSFileSystem poiFs, long filesize, ContentParserOptions options) throws Exception { // DocumentEntry documentEntry = (DocumentEntry) // poiFs.getRoot().getEntry(POIFS_WORD_DOC); // DocumentInputStream documentInputStream = // poiFs.createDocumentInputStream(POIFS_ENTRY); WordExtractor extractor = new WordExtractor(poiFs); return extractor.getText(); }
// Collect the paragraph text and the footnote text of the document.
// The extractor is closed in a finally block so it is released even when
// one of the extraction calls throws.
WordExtractor extractor = new WordExtractor(document);
try {
    paragraphs.addAll(Arrays.asList(extractor.getParagraphText()));
    footnotes.addAll(Arrays.asList(extractor.getFootnoteText()));
} finally {
    extractor.close();
}
/**
 * Opens the Word file at {@code fileName} and prints its paragraphs using
 * {@link #readParagraphs(HWPFDocument)}.
 *
 * <p>Despite the method name, the file is parsed with {@code HWPFDocument}.
 * Any failure (including a {@code null} file name) is caught and its stack
 * trace is printed; nothing is rethrown to the caller.
 *
 * @param fileName path of the file to read
 */
public static void readDocxFile(String fileName) {
    // try-with-resources closes the stream on every path; the File is built
    // inside the resource expression so that a bad fileName is still caught
    // by the catch block below.
    try (FileInputStream in = new FileInputStream(new File(fileName).getAbsolutePath())) {
        POIFSFileSystem fs = new POIFSFileSystem(in);
        HWPFDocument doc = new HWPFDocument(fs);
        readParagraphs(doc);
    } catch (Exception e) {
        e.printStackTrace();
    }
}

/**
 * Prints the number of paragraphs in {@code doc}, followed by the length
 * and the text of each paragraph, to standard output.
 *
 * @param doc the document whose paragraphs are printed
 * @throws Exception propagated from the extractor
 */
public static void readParagraphs(HWPFDocument doc) throws Exception {
    WordExtractor we = new WordExtractor(doc);

    // Get the total number of paragraphs
    String[] paragraphs = we.getParagraphText();
    System.out.println("Total Paragraphs: " + paragraphs.length);

    for (int i = 0; i < paragraphs.length; i++) {
        System.out.println("Length of paragraph " + (i + 1) + ": " + paragraphs[i].length());
        System.out.println(paragraphs[i]);
    }
}
boolean isHidden = false; try { fs = new POIFSFileSystem(new FileInputStream(filesname)); HWPFDocument doc = new HWPFDocument(fs); WordExtractor we = new WordExtractor(doc); String[] paragraphs = we.getParagraphText(); System.out.println("Word Document has " + paragraphs.length + " paragraphs"); Range range = doc.getRange(); for (int k = 0; k < range.numParagraphs(); k++) { org.apache.poi.hwpf.usermodel.Paragraph paragraph = range .getParagraph(k); paragraph.text().trim(); paragraph.text().replaceAll("\\cM?\r?\n", ""); for (int j = 0; j < paragraph.numCharacterRuns(); j++) { org.apache.poi.hwpf.usermodel.CharacterRun cr = paragraph .getCharacterRun(j); if (cr.isVanished()) { // it is hidden System.out.println("text is hidden "); isHidden = true; break; } }
/**
 * Extracts the text of a Word file.
 *
 * <p>Only files whose lower-cased name ends with {@code FILE_DOC} are
 * processed. In the extracted text, runs of two or more {@code \r\n} are
 * collapsed to a single {@code \r\n}, runs of two or more {@code \n} to a
 * single {@code \n}, and the result is passed through {@code trim}.
 *
 * @param f the file to read
 * @return the processed text, or {@code EMPTY} when the file name does not
 *         match or any exception occurs (the exception is logged, not
 *         rethrown)
 */
public static String docText(File f) {
    try {
        if (toLowerCase(f.getName()).endsWith(FILE_DOC)) {
            // try-with-resources closes the stream even when extraction
            // throws; previously it was closed on the success path only.
            try (FileInputStream fis = new FileInputStream(f)) {
                WordExtractor ex = new WordExtractor(fis);
                String text = ex.getText();
                text = text.replaceAll("(\\r\\n){2,}", "\r\n").replaceAll("(\\n){2,}", "\n");
                return trim(text);
            }
        }
    } catch (Exception e) {
        LOG.error(e.getLocalizedMessage(), e);
    }
    return EMPTY;
}
FileInputStream fis = new FileInputStream(file.getAbsolutePath()); HWPFDocument docs = new HWPFDocument(fis); extractor = new WordExtractor(docs); String[] fileData = extractor.getParagraphText(); for (int i = 0; i < fileData.length; i++) data+=fileData[i]; fis.close(); file = new File("file2.doc"); fis = new FileInputStream(file.getAbsolutePath()); docs = new HWPFDocument(fis); extractor = new WordExtractor(docs); fileData = extractor.getParagraphText(); for (int i = 0; i < fileData.length; i++)
FileInputStream fis = new FileInputStream(file.getAbsolutePath()); HWPFDocument document = new HWPFDocument(fis); WordExtractor extractor = new WordExtractor(document); String rawText = extractor.getText(); String displayText = extractor.stripFields(rawText);
FileInputStream fis = new FileInputStream(file); WordExtractor extractor = new WordExtractor(fs.getRoot()); for (String rawText : extractor.getParagraphText()) { lines.add(extractor.stripFields(rawText)); extractor.close(); } catch (IOException e) { } finally { try { fis.close(); } catch (IOException ioex) {
//you can use the org.apache.poi.hwpf.extractor.WordExtractor to get the text String fileName = "example.doc"; HWPFDocument wordDoc = new HWPFDocument(new FileInputStream(fileName)); WordExtractor extractor = new WordExtractor(wordDoc); String[] text = extractor.getParagraphText(); int lineCounter = text.length; String articleStr = ""; // This string object use to store text from the word document. for(int index = 0;index < lineCounter;++ index){ String paragraphStr = text[index].replaceAll("\r\n","").replaceAll("\n","").trim(); int paragraphLength = paragraphStr.length(); if(paragraphLength != 0){ articleStr.concat(paragraphStr); } } //you can use the org.apache.poi.hwpf.usermodel.Picture to get the image List<Picture> picturesList = wordDoc.getPicturesTable().getAllPictures(); for(int i = 0;i < picturesList.size();++i){ BufferedImage image = null; Picture pic = picturesList.get(i); image = ImageIO.read(new ByteArrayInputStream(pic.getContent())); if(image != null){ System.out.println("Image["+i+"]"+" ImageWidth:"+image.getWidth()+" ImageHeight:"+image.getHeight()+" Suggest Image Format:"+pic.suggestFileExtension()); } }
fis = new FileInputStream(new File(FilePath)); XWPFDocument doc = new XWPFDocument(fis); XWPFWordExtractor extract = new XWPFWordExtractor(doc); fis = new FileInputStream(new File(FilePath)); HWPFDocument doc = new HWPFDocument(fis); WordExtractor extractor = new WordExtractor(doc); System.out.println(extractor.getText()); } catch (IOException e) { e.printStackTrace();
/** * Extrae el texto de un fichero word. * @param in * @return String. Devuelve el texto crudo * @throws Exception */ public static String extractText(InputStream in) throws Exception { String result = ""; HWPFDocument doc = new HWPFDocument(in); WordExtractor we = new WordExtractor(doc); result = we.getText(); // Eliminamos los caracteres que no nos sirven para indexar. result = ExtractorUtil.removeControlChars(result); return result; }
/**
 * Initializes the Word document state from an input stream.
 *
 * <p>On success {@code doc}, {@code we} and {@code range} are assigned in
 * that order. Any {@link Throwable} raised along the way is stored in
 * {@code error} instead of being thrown, so values assigned before the
 * failure are kept.
 *
 * @param is stream to read the document from
 */
public void init(InputStream is) {
    try {
        doc = new HWPFDocument(new POIFSFileSystem(is));
        we = new WordExtractor(doc);
        range = doc.getRange();
    } catch (Throwable failure) {
        // Recorded rather than rethrown
        error = failure;
    }
}
}
// Build an HWPFDocument from the stream fis and wrap it in a WordExtractor.
WordExtractor we = new WordExtractor(new HWPFDocument(fis));
if(version.equals(ContentHandler.VERSION_2003)) WordExtractor ex = new WordExtractor(in); result = ex.getText(); SummaryInformation info = ex.getSummaryInformation(); this.m_summary = info; this.m_documentSummary = ex.getDocSummaryInformation(); metaInfo = extractMetaInformation();
/**
 * Reads a Word document from {@code inputStream} with a
 * {@code WordExtractor} and records the outcome in {@code resultBuilder}.
 *
 * <p>When summary information is available, the MIME type, title, author,
 * subject, creation date, modification date and keywords are written to the
 * metas. A new document is then created, each paragraph is added to it as
 * {@code CONTENT}, and the result of {@code languageDetection} is added as
 * {@code LANG_DETECTION}. The extractor is closed on exit.
 *
 * @param inputStream stream supplying the Word document
 * @param resultBuilder receives the extracted metadata and content
 * @throws IOException if the extractor cannot be created, read or closed
 */
private void currentWordExtraction(final InputStream inputStream, final ParserResultBuilder resultBuilder)
        throws IOException {
    try (final WordExtractor extractor = new WordExtractor(inputStream)) {

        final SummaryInformation summary = extractor.getSummaryInformation();
        if (summary != null) {
            final ParserFieldsBuilder metas = resultBuilder.metas();
            metas.set(MIME_TYPE, DEFAULT_MIMETYPES[0]);
            metas.add(TITLE, summary.getTitle());
            metas.add(AUTHOR, summary.getAuthor());
            metas.add(SUBJECT, summary.getSubject());
            metas.add(CREATION_DATE, summary.getCreateDateTime());
            metas.add(MODIFICATION_DATE, summary.getLastSaveDateTime());
            metas.add(KEYWORDS, summary.getKeywords());
        }

        final ParserFieldsBuilder document = resultBuilder.newDocument();
        final String[] paragraphs = extractor.getParagraphText();
        if (paragraphs != null) {
            for (int i = 0; i < paragraphs.length; i++) {
                document.add(CONTENT, paragraphs[i]);
            }
        }
        document.add(LANG_DETECTION, languageDetection(document, CONTENT, 10000));
    }
}
/** * Get the text from the word file, as an array with one String per * paragraph */ public String[] getParagraphText() { String[] ret; // Extract using the model code try { Range r = doc.getRange(); ret = getParagraphText( r ); } catch ( Exception e ) { // Something's up with turning the text pieces into paragraphs // Fall back to ripping out the text pieces ret = new String[1]; ret[0] = getTextFromPieces(); } return ret; }