org.apache.poi.hwpf.extractor.WordExtractor java code examples

Refine search

/**
 * Command line extractor, so people will stop moaning that they can't just
 * run this.
 *
 * <p>Prints a usage message to standard error and exits with status 1 when
 * no argument is given; otherwise prints the extracted text of the file
 * named by the first argument to standard output.
 *
 * @param args command line arguments; {@code args[0]} is the file to read
 * @throws IOException if the file cannot be opened or read
 */
public static void main( String[] args ) throws IOException
{
  if ( args.length == 0 )
  {
    System.err.println( "Use:" );
    System.err
        .println( "   java org.apache.poi.hwpf.extractor.WordExtractor <filename>" );
    System.exit( 1 );
  }
  // Process the first argument as a file. try-with-resources guarantees the
  // file handle is released on every path, including when extraction throws.
  try ( FileInputStream fin = new FileInputStream( args[0] ) )
  {
    WordExtractor extractor = new WordExtractor( fin );
    System.out.println( extractor.getText() );
  }
}

FileInputStream fis = new FileInputStream(file.getAbsolutePath());
HWPFDocument document = new HWPFDocument(fis);
extractor = new WordExtractor(document);
String[] fileData = extractor.getParagraphText();
for (int i = 0; i < fileData.length; i++)

/**
 * Command line extractor, so people will stop moaning that they can't just
 * run this.
 *
 * <p>Prints a usage message to standard error and exits with status 1 when
 * no argument is given; otherwise prints the extracted text of the file
 * named by the first argument to standard output.
 *
 * @param args command line arguments; {@code args[0]} is the file to read
 * @throws IOException if the file cannot be opened or read
 */
public static void main( String[] args ) throws IOException {
  if ( args.length == 0 ) {
    System.err.println( "Use:" );
    System.err
        .println( "   java org.apache.poi.hwpf.extractor.WordExtractor <filename>" );
    System.exit( 1 );
  }
  // Process the first argument as a file. The stream is scoped with
  // try-with-resources so it is released even when the WordExtractor
  // constructor itself throws, before the inner finally is ever reached.
  try ( InputStream fin = new FileInputStream( args[0] ) ) {
    WordExtractor extractor = new WordExtractor( fin );
    try {
      System.out.println( extractor.getText() );
    } finally {
      extractor.close();
    }
  }
}

HWPFDocument document;
try {
  document = new HWPFDocument(root);
} catch (org.apache.poi.EncryptedDocumentException e) {
    throw new EncryptedDocumentException(e);
    new org.apache.poi.hwpf.extractor.WordExtractor(document);
PicturesTable pictureTable = document.getPicturesTable();
PicturesSource pictures = new PicturesSource(document);
HeaderStories headerFooter = null;
Range r = document.getRange();
ListManager listManager = new ListManager(document);
for (int i = 0; i < r.numParagraphs(); i++) {
  for (String paragraph : wordExtractor.getMainTextboxText()) {
    xhtml.element("p", paragraph);
for (String paragraph : wordExtractor.getFootnoteText()) {
  xhtml.element("p", paragraph);
for (String paragraph : wordExtractor.getCommentsText()) {
  xhtml.element("p", paragraph);
for (String paragraph : wordExtractor.getEndnoteText()) {
  xhtml.element("p", paragraph);

/**
 * {@inheritDoc}
 */
@Override
protected String extractText(POIFSFileSystem poiFs, long filesize, ContentParserOptions options)
  throws Exception {
 // The extractor is built directly on the POIFS file system; filesize and
 // options are not consulted here.
 return new WordExtractor(poiFs).getText();
}

 // Wrap the document in a text extractor.
 WordExtractor extractor = new WordExtractor(document);
// Collect the paragraph text and the footnote text into their respective collections.
paragraphs.addAll(Arrays.asList(extractor.getParagraphText()) ); 
footnotes.addAll(Arrays.asList(extractor.getFootnoteText()) );
// Done with the extractor; close it.
extractor.close();

 /**
  * Opens the Word file at the given path and prints its paragraphs via
  * {@code readParagraphs}.
  *
  * <p>Any failure is caught and its stack trace printed; nothing is
  * rethrown to the caller.
  *
  * @param fileName path of the file to read
  */
 public static void readDocxFile(String fileName) {

  // try-with-resources releases the file handle on every path; the stream
  // stays open until readParagraphs has finished with the document.
  try (FileInputStream in = new FileInputStream(new File(fileName).getAbsolutePath())) {
    POIFSFileSystem fs = new POIFSFileSystem(in);
    HWPFDocument doc = new HWPFDocument(fs);
    readParagraphs(doc);

  } catch (Exception e) {
    e.printStackTrace();
  }
 }

/**
 * Prints the number of paragraphs in the document, then each paragraph's
 * length followed by its text.
 *
 * @param doc the document whose paragraphs are printed
 * @throws Exception declared by the original signature
 */
public static void readParagraphs(HWPFDocument doc) throws Exception{
    final String[] texts = new WordExtractor(doc).getParagraphText();
    System.out.println("Total Paragraphs: "+texts.length);

    // Paragraph numbers shown to the user are 1-based.
    int number = 1;
    for (String text : texts) {
      System.out.println("Length of paragraph "+number+": "+ text.length());
      System.out.println(text);
      number++;
    }
}

boolean isHidden = false;
 try {
   fs = new POIFSFileSystem(new FileInputStream(filesname));
   HWPFDocument doc = new HWPFDocument(fs);
   WordExtractor we = new WordExtractor(doc);
   String[] paragraphs = we.getParagraphText();
   System.out.println("Word Document has " + paragraphs.length
       + " paragraphs");
   Range range = doc.getRange();
   for (int k = 0; k < range.numParagraphs(); k++) {
     org.apache.poi.hwpf.usermodel.Paragraph paragraph = range
         .getParagraph(k);
     paragraph.text().trim();
     paragraph.text().replaceAll("\\cM?\r?\n", "");
     for (int j = 0; j < paragraph.numCharacterRuns(); j++) {
       org.apache.poi.hwpf.usermodel.CharacterRun cr = paragraph
           .getCharacterRun(j);
       if (cr.isVanished()) {
         // it is hidden
         System.out.println("text is hidden ");
         isHidden = true;
         break;
       }
     }

/**
 * Extracts the text of a Word file and collapses repeated line breaks.
 *
 * <p>Only files whose lower-cased name ends with {@code FILE_DOC} are read.
 * Runs of two or more consecutive {@code \r\n} are replaced by a single
 * {@code \r\n}, and runs of two or more {@code \n} by a single {@code \n},
 * before the result is passed through {@code trim}.
 *
 * @param f the file to read
 * @return the processed text, or {@code EMPTY} when the name does not match
 *         or any exception occurs (the exception is logged, not rethrown)
 */
public static String docText(File f) {
  try {
    if (toLowerCase(f.getName()).endsWith(FILE_DOC)) {
      // try-with-resources: the original only closed the stream on the
      // success path, leaking the handle whenever extraction threw.
      try (FileInputStream fis = new FileInputStream(f)) {
        WordExtractor ex = new WordExtractor(fis);
        String text = ex.getText();
        text = text.replaceAll("(\\r\\n){2,}", "\r\n").replaceAll("(\\n){2,}", "\n");
        return trim(text);
      }
    }
  } catch (Exception e) {
    LOG.error(e.getLocalizedMessage(), e);
  }
  return EMPTY;
}

FileInputStream fis = new FileInputStream(file.getAbsolutePath());
HWPFDocument docs = new HWPFDocument(fis);
extractor = new WordExtractor(docs);
String[] fileData = extractor.getParagraphText();
for (int i = 0; i < fileData.length; i++)
    data+=fileData[i];
fis.close();
file = new File("file2.doc");
fis = new FileInputStream(file.getAbsolutePath());
docs = new HWPFDocument(fis);
extractor = new WordExtractor(docs);
fileData = extractor.getParagraphText();
for (int i = 0; i < fileData.length; i++)

// Open the file and load it as a Word document.
FileInputStream fis = new FileInputStream(file.getAbsolutePath());
 HWPFDocument document = new HWPFDocument(fis);
 WordExtractor extractor = new WordExtractor(document);
 // Text as returned by the extractor.
 String rawText = extractor.getText();
 // Same text with fields (e.g. macros, page markers) removed by stripFields.
 String displayText = extractor.stripFields(rawText);

FileInputStream fis = new FileInputStream(file);
    WordExtractor extractor = new WordExtractor(fs.getRoot());
    for (String rawText : extractor.getParagraphText()) {
      lines.add(extractor.stripFields(rawText));
    extractor.close();
  } catch (IOException e) {
  } finally {
    try {
      fis.close();
    } catch (IOException ioex) {

//you can use the org.apache.poi.hwpf.extractor.WordExtractor to get the text
 String fileName = "example.doc";
 HWPFDocument wordDoc = new HWPFDocument(new FileInputStream(fileName));
 WordExtractor extractor = new WordExtractor(wordDoc);
 String[] text = extractor.getParagraphText();
 int lineCounter = text.length;
 String articleStr = ""; // This string object use to store text from the word document.
 for(int index = 0;index < lineCounter;++ index){
   // Drop line breaks and surrounding whitespace from each paragraph.
   String paragraphStr = text[index].replaceAll("\r\n","").replaceAll("\n","").trim();
   int paragraphLength = paragraphStr.length();
   if(paragraphLength != 0){
     // String is immutable: concat() returns a new String, so the result
     // must be assigned back or articleStr stays empty.
     articleStr = articleStr.concat(paragraphStr);
   }
 }
 //you can use the org.apache.poi.hwpf.usermodel.Picture to get the image
 List<Picture> picturesList = wordDoc.getPicturesTable().getAllPictures();
 for(int i = 0;i < picturesList.size();++i){
   BufferedImage image = null;
   Picture pic = picturesList.get(i);
   // ImageIO.read returns null when no registered reader can decode the
   // bytes, hence the null check before printing the dimensions.
   image = ImageIO.read(new ByteArrayInputStream(pic.getContent()));
   if(image != null){
     System.out.println("Image["+i+"]"+" ImageWidth:"+image.getWidth()+" ImageHeight:"+image.getHeight()+" Suggest Image Format:"+pic.suggestFileExtension());
   }
 }

fis = new FileInputStream(new File(FilePath));
XWPFDocument doc = new XWPFDocument(fis);
XWPFWordExtractor extract = new XWPFWordExtractor(doc);
  fis = new FileInputStream(new File(FilePath));
  HWPFDocument doc = new HWPFDocument(fis);
  WordExtractor extractor = new WordExtractor(doc);
  System.out.println(extractor.getText());
} catch (IOException e) {
  e.printStackTrace();

/**
 * Extracts the text of a Word file.
 * @param in stream the Word document is read from
 * @return String. The raw text after {@code ExtractorUtil.removeControlChars}
 *         has been applied
 * @throws Exception
 */
public static String extractText(InputStream in) throws Exception {
  final WordExtractor extractor = new WordExtractor(new HWPFDocument(in));
  // Remove the characters that are of no use for indexing.
  return ExtractorUtil.removeControlChars(extractor.getText());
}

 /**
  * Initializes the word document, its extractor and its range from an
  * input stream. Any {@code Throwable} raised along the way is stored in
  * {@code error} instead of being propagated.
  * 
  * @param is stream the document is read from
  */
 public void init(InputStream is) {
  try {
   doc = new HWPFDocument(new POIFSFileSystem(is));
   we = new WordExtractor(doc);
   range = doc.getRange();
  } catch (Throwable failure) {
   // Remember the failure; fields assigned before it keep their new values.
   error = failure;
  }
 }
}

WordExtractor we = new WordExtractor(new HWPFDocument(fis));

if(version.equals(ContentHandler.VERSION_2003))
  WordExtractor ex  = new WordExtractor(in);
  result = ex.getText();   
   SummaryInformation info = ex.getSummaryInformation();
   this.m_summary = info;
   this.m_documentSummary = ex.getDocSummaryInformation();
   metaInfo = extractMetaInformation();

/**
 * Reads a Word document from the stream and feeds its summary metadata and
 * paragraph text into the result builder.
 *
 * @param inputStream   stream the Word document is read from
 * @param resultBuilder receives the metadata fields and the content document
 * @throws IOException declared by the original signature
 */
private void currentWordExtraction(final InputStream inputStream, final ParserResultBuilder resultBuilder)
    throws IOException {
  try (final WordExtractor extractor = new WordExtractor(inputStream)) {
    // Metadata is only recorded when summary information is available.
    final SummaryInformation summary = extractor.getSummaryInformation();
    if (summary != null) {
      final ParserFieldsBuilder metaFields = resultBuilder.metas();
      metaFields.set(MIME_TYPE, DEFAULT_MIMETYPES[0]);
      metaFields.add(TITLE, summary.getTitle());
      metaFields.add(AUTHOR, summary.getAuthor());
      metaFields.add(SUBJECT, summary.getSubject());
      metaFields.add(CREATION_DATE, summary.getCreateDateTime());
      metaFields.add(MODIFICATION_DATE, summary.getLastSaveDateTime());
      metaFields.add(KEYWORDS, summary.getKeywords());
    }

    // One CONTENT entry per paragraph, in document order.
    final ParserFieldsBuilder contentFields = resultBuilder.newDocument();
    final String[] paragraphs = extractor.getParagraphText();
    if (paragraphs != null) {
      for (int i = 0; i < paragraphs.length; i++) {
        contentFields.add(CONTENT, paragraphs[i]);
      }
    }
    contentFields.add(LANG_DETECTION, languageDetection(contentFields, CONTENT, 10000));
  }
}

/**
 * Returns the text of the word file as an array holding one String per
 * paragraph. If the paragraph-based extraction fails, the array holds a
 * single String with the text taken from the text pieces instead.
 */
public String[] getParagraphText() {
  try {
    // Preferred path: extract using the model code
    return getParagraphText( doc.getRange() );
  } catch ( Exception e ) {
    // Something's up with turning the text pieces into paragraphs,
    // so fall back to ripping out the text pieces as one entry
    return new String[] { getTextFromPieces() };
  }
}

Javadoc

Class to extract the text from a Word Document. You should use either getParagraphText() or getText() unless you have a strong reason otherwise.

Most used methods

<init>
Create a new Word Extractor
getText
Grab the text, based on the WordToTextConverter. Shouldn't include any crud, but slower than getTextFromPieces
getParagraphText
close
getFootnoteText
appendHeaderFooter
Add the header/footer text, if it's not empty
getCommentsText
getEndnoteText
getMainTextboxText
getSummaryInformation
getTextFromPieces
Grab the text out of the text pieces. Might also include various bits of crud, but will work in cases where turning the text pieces into paragraphs fails.
stripFields
Removes any fields (eg macros, page markers etc) from the string.

Popular in Java

Making http requests using okhttp
getContentResolver (Context)
scheduleAtFixedRate (ScheduledExecutorService)
setContentView (Activity)
FileReader (java.io)
A specialized Reader that reads from a file in the file system. All read requests made by calling me
Calendar (java.util)
Calendar is an abstract base class for converting between a Date object and a set of integer fields
NoSuchElementException (java.util)
Thrown when trying to retrieve an element past the end of an Enumeration or Iterator.
SortedSet (java.util)
SortedSet is a Set which iterates over its elements in a sorted order. The order is determined either by the elements' natural ordering or by a Comparator.
TreeMap (java.util)
Walk the nodes of the tree left-to-right or right-to-left. Note that in descending iterations, next
Manifest (java.util.jar)
The Manifest class is used to obtain attribute information for a JarFile and its entries.
Top 12 Jupyter Notebook extensions

How to use WordExtractor in org.apache.poi.hwpf.extractor

Best Java code snippets using org.apache.poi.hwpf.extractor.WordExtractor (Showing top 20 results out of 315)

Refine search

How to use
WordExtractor
in
org.apache.poi.hwpf.extractor