/**
 * Returns the notes text of the slideshow, leaving the slide text out.
 *
 * @return the result of {@code getText} with only the note flag switched on
 */
public String getNotes() {
    final boolean slideText = false;
    final boolean noteText = true;
    final boolean commentText = false;
    final boolean masterText = false;
    return getText(slideText, noteText, commentText, masterText);
}
/** * Fetches the ALL the text of the powerpoint file, in a List of * strings, one per text record */ public List<String> getTextAsVector() { List<String> textV = new ArrayList<>(); // Set to the start of the file int walkPos = 0; // Start walking the file, looking for the records while(walkPos != -1) { walkPos = findTextRecords(walkPos,textV); } // Return what we find return textV; }
/**
 * Really basic text extractor, that will also return lots of crud text.
 * Takes a single argument, the file to extract from.
 *
 * Prints a usage message to stderr and exits with status 1 when no
 * argument is supplied; otherwise prints the extracted text to stdout.
 *
 * @param args command line arguments; args[0] is the file to extract from
 * @throws IOException propagated from creating, reading or closing the extractor
 */
public static void main(String args[]) throws IOException {
    if (args.length < 1) {
        System.err.println("Useage:");
        System.err.println("\tQuickButCruddyTextExtractor <file>");
        System.exit(1);
    }

    String file = args[0];

    QuickButCruddyTextExtractor ppe = new QuickButCruddyTextExtractor(file);
    try {
        System.out.println(ppe.getTextAsString());
    } finally {
        // Close in finally so it still happens when extraction or printing throws
        ppe.close();
    }
}
/**
 * Basic extractor. Returns all the text, and optionally all the notes.
 *
 * Argument handling, as implemented: with a single argument it is taken as
 * the file. With more than one argument, notes are switched on and args[1]
 * is taken as the file; with more than two, comments are switched on too.
 * Master text is always requested.
 *
 * @param args command line arguments, see above
 * @throws IOException propagated from creating, reading or closing the extractor
 */
public static void main(String args[]) throws IOException {
    if (args.length < 1) {
        System.err.println("Useage:");
        System.err.println("\tPowerPointExtractor [-notes] <file>");
        System.exit(1);
    }

    boolean notes = false;
    boolean comments = false;
    boolean master = true;
    String file;
    if (args.length > 1) {
        notes = true;
        file = args[1];
        if (args.length > 2) {
            comments = true;
        }
    } else {
        file = args[0];
    }

    PowerPointExtractor ppe = new PowerPointExtractor(file);
    try {
        System.out.println(ppe.getText(true, notes, comments, master));
    } finally {
        // The extractor was previously never closed; close it even on failure
        ppe.close();
    }
}
/**
 * Basic extractor. Returns all the text, and optionally all the notes.
 *
 * Argument handling, as implemented: with a single argument it is taken as
 * the file. With more than one argument, notes are switched on and args[1]
 * is taken as the file; with more than two, comments are switched on too.
 * Master text is always requested.
 *
 * @param args command line arguments, see above
 * @throws IOException propagated from creating, reading or closing the extractor
 */
public static void main(String args[]) throws IOException {
    if (args.length < 1) {
        System.err.println("Useage:");
        System.err.println("\tPowerPointExtractor [-notes] <file>");
        System.exit(1);
    }

    boolean notes = false;
    boolean comments = false;
    boolean master = true;
    String file;
    if (args.length > 1) {
        notes = true;
        file = args[1];
        if (args.length > 2) {
            comments = true;
        }
    } else {
        file = args[0];
    }

    PowerPointExtractor ppe = new PowerPointExtractor(file);
    try {
        System.out.println(ppe.getText(true, notes, comments, master));
    } finally {
        // Close in finally so it still happens when extraction or printing throws
        ppe.close();
    }
}
/**
 * Fetches all the text of the powerpoint file, as a single string.
 *
 * Every entry returned by {@code getTextAsVector()} is appended in order,
 * and a newline is added after any entry that does not already end in one.
 *
 * @return the concatenated text; empty when there are no entries
 */
public String getTextAsString() {
    // Local, single-threaded accumulator: StringBuilder avoids the needless
    // synchronisation of StringBuffer
    StringBuilder ret = new StringBuilder();
    for (String text : getTextAsVector()) {
        ret.append(text);
        if (!text.endsWith("\n")) {
            ret.append('\n');
        }
    }
    return ret.toString();
}
/**
 * Creates an extractor backed by the supplied slideshow.
 *
 * The slideshow's underlying implementation object is handed to the
 * superclass constructor, the slideshow is registered via
 * {@code setFilesystem}, and text extraction is delegated to a
 * {@code SlideShowExtractor} wrapping the same slideshow.
 *
 * @param slideShow the slideshow to extract from; dereferenced immediately
 */
public PowerPointExtractor(final HSLFSlideShow slideShow) {
    super(slideShow.getSlideShowImpl());
    this.setFilesystem(slideShow);
    this.delegate = new SlideShowExtractor<>(slideShow);
}
/**
 * {@inheritDoc}
 */
public Reader extractText(InputStream stream, String type, String encoding)
        throws IOException {
    Reader result;
    try {
        final PowerPointExtractor pptExtractor = new PowerPointExtractor(stream);
        final String content = pptExtractor.getText(true, true);
        result = new StringReader(content);
    } catch (RuntimeException e) {
        // Runtime failures are logged and turned into empty content
        logger.warn("Failed to extract PowerPoint text content", e);
        result = new StringReader("");
    } finally {
        // The stream is always closed here; a failure to close is ignored
        try {
            stream.close();
        } catch (IOException ignored) {
        }
    }
    return result;
}
// closes the enclosing declaration, which is opened outside this block
}
/**
 * Returns the notes text of the slideshow, leaving the slide text out.
 *
 * @return the result of {@code getText} with slide text off and note text on
 */
public String getNotes() {
    final boolean slideText = false;
    final boolean noteText = true;
    return getText(slideText, noteText);
}
/**
 * Really basic text extractor, that will also return lots of crud text.
 * Takes a single argument, the file to extract from.
 *
 * Prints a usage message to stderr and exits with status 1 when no
 * argument is supplied; otherwise prints the extracted text to stdout.
 *
 * @param args command line arguments; args[0] is the file to extract from
 * @throws IOException propagated from creating, reading or closing the extractor
 */
public static void main(String args[]) throws IOException {
    if (args.length < 1) {
        System.err.println("Useage:");
        System.err.println("\tQuickButCruddyTextExtractor <file>");
        System.exit(1);
    }

    String file = args[0];

    QuickButCruddyTextExtractor ppe = new QuickButCruddyTextExtractor(file);
    try {
        System.out.println(ppe.getTextAsString());
    } finally {
        // Close in finally so it still happens when extraction or printing throws
        ppe.close();
    }
}
/** * Fetches the ALL the text of the powerpoint file, in a vector of * strings, one per text record */ public Vector<String> getTextAsVector() { Vector<String> textV = new Vector<String>(); // Set to the start of the file int walkPos = 0; // Start walking the file, looking for the records while(walkPos != -1) { int newPos = findTextRecords(walkPos,textV); walkPos = newPos; } // Return what we find return textV; }
/**
 * Fetches all the text of the powerpoint file, as a single string.
 *
 * Every entry returned by {@code getTextAsVector()} is appended in order,
 * and a newline is added after any entry that does not already end in one.
 *
 * @return the concatenated text; empty when there are no entries
 */
public String getTextAsString() {
    // Local, single-threaded accumulator: StringBuilder avoids the needless
    // synchronisation of StringBuffer
    StringBuilder ret = new StringBuilder();
    for (String text : getTextAsVector()) {
        ret.append(text);
        if (!text.endsWith("\n")) {
            ret.append('\n');
        }
    }
    return ret.toString();
}
/**
 * Extracts text from a PowerPoint input stream using POI's
 * PowerPointExtractor and wraps it in an ExtractData.
 *
 * @param in     the stream to read from; must not be null
 * @param params not read by this method
 * @return an ExtractData built from the extractor's {@code getText()} result
 * @throws RobotSystemException if {@code in} is null
 * @throws ExtractException     wrapping any IOException raised during extraction
 */
@Override
public ExtractData getText(final InputStream in, final Map<String, String> params) {
    if (in != null) {
        try {
            final String text =
                    new org.apache.poi.hslf.extractor.PowerPointExtractor(in).getText();
            return new ExtractData(text);
        } catch (final IOException e) {
            throw new ExtractException(e);
        }
    }
    throw new RobotSystemException("The inputstream is null.");
}
/**
 * Fetches text using the extractor's current default flags for slides,
 * notes, comments and master text. Per the original description this is
 * all the slide text but not the notes, unless setSlidesByDefault() and
 * setNotesByDefault() have been called to change that.
 *
 * @return the text selected by the four default flags
 */
public String getText() {
    return this.getText(
            this._slidesByDefault,
            this._notesByDefault,
            this._commentsByDefault,
            this._masterByDefault);
}
/**
 * Extracts text from a PowerPoint input stream using POI's
 * PowerPointExtractor and wraps it in an ExtractData.
 *
 * @param in     the stream to read from; must not be null
 * @param params not read by this method
 * @return an ExtractData built from the extractor's {@code getText()} result
 * @throws RobotSystemException if {@code in} is null
 * @throws ExtractException     wrapping any IOException raised during extraction
 */
@Override
public ExtractData getText(final InputStream in, final Map<String, String> params) {
    if (in != null) {
        try {
            final String text =
                    new org.apache.poi.hslf.extractor.PowerPointExtractor(in).getText();
            return new ExtractData(text);
        } catch (final IOException e) {
            throw new ExtractException(e);
        }
    }
    throw new RobotSystemException("The inputstream is null.");
}
/**
 * Fetches slide text and/or note text from the slideshow. The last block
 * of text in a TextRun normally has its trailing \n stripped, so it is
 * added back.
 *
 * Comment and master text follow the extractor's current defaults.
 *
 * @param getSlideText fetch slide text
 * @param getNoteText fetch note text
 * @return the text selected by the two arguments plus the two default flags
 */
public String getText(boolean getSlideText, boolean getNoteText) {
    return this.getText(
            getSlideText,
            getNoteText,
            this._commentsByDefault,
            this._masterByDefault);
}
/**
 * Extracts text from a PowerPoint input stream using POI's
 * PowerPointExtractor and wraps it in an ExtractData.
 *
 * @param in     the stream to read from; must not be null
 * @param params not read by this method
 * @return an ExtractData built from the extractor's {@code getText()} result
 * @throws RobotSystemException if {@code in} is null
 * @throws ExtractException     wrapping any IOException raised during extraction
 */
@Override
public ExtractData getText(final InputStream in, final Map<String, String> params) {
    if (in != null) {
        try {
            final String text =
                    new org.apache.poi.hslf.extractor.PowerPointExtractor(in).getText();
            return new ExtractData(text);
        } catch (final IOException e) {
            throw new ExtractException(e);
        }
    }
    throw new RobotSystemException("The inputstream is null.");
}
/**
 * Fetches slide text and/or note text from the slideshow. The last block
 * of text in a TextRun normally has its trailing \n stripped, so it is
 * added back.
 *
 * Comment and master text follow the extractor's current defaults.
 *
 * @param getSlideText fetch slide text
 * @param getNoteText fetch note text
 * @return the text selected by the two arguments plus the two default flags
 */
public String getText(boolean getSlideText, boolean getNoteText) {
    return this.getText(
            getSlideText,
            getNoteText,
            this.commentsByDefault,
            this.masterByDefault);
}
/**
 * Extracts text from a PowerPoint input stream using POI's
 * PowerPointExtractor and wraps it in an ExtractData.
 *
 * @param in     the stream to read from; must not be null
 * @param params not read by this method
 * @return an ExtractData built from the extractor's {@code getText()} result
 * @throws CrawlerSystemException if {@code in} is null
 * @throws ExtractException       wrapping any IOException raised during extraction
 */
@Override
public ExtractData getText(final InputStream in, final Map<String, String> params) {
    if (in != null) {
        try {
            // The extractor is intentionally left unclosed here (warning suppressed)
            @SuppressWarnings("resource")
            final org.apache.poi.hslf.extractor.PowerPointExtractor pptExtractor =
                    new org.apache.poi.hslf.extractor.PowerPointExtractor(in);
            final String text = pptExtractor.getText();
            return new ExtractData(text);
        } catch (final IOException e) {
            throw new ExtractException(e);
        }
    }
    throw new CrawlerSystemException("The inputstream is null.");
}
/**
 * Builds an index document from the raw bytes of a PowerPoint file.
 *
 * The bytes in {@code fileData.data} are opened as a POIFS file system,
 * the text is pulled out with a PowerPointExtractor, and the result is
 * paired with {@code fileData.path}.
 *
 * @param fileData carries the file bytes ({@code data}) and its {@code path}
 * @return an IndexDocument holding the path and the extracted text
 * @throws SolrException with SERVER_ERROR when reading the data raises an
 *                       IOException; the IOException is attached as the cause
 */
public IndexDocument getIndexedDocument(File2Index fileData) throws SolrException {
    try {
        POIFSFileSystem fs = new POIFSFileSystem(new ByteArrayInputStream(fileData.data));
        PowerPointExtractor extractor = new PowerPointExtractor(fs);
        String ppText = extractor.getText();

        return new IndexDocument(fileData.path, ppText, null);
    } catch (IOException e) {
        String msg = "Failed to write to the index";
        log.error(msg, e);
        // Pass the original exception along so callers keep the root cause
        throw new SolrException(ErrorCode.SERVER_ERROR, msg, e);
    }
}