SheetTextExtractor sheetExtractor = new SheetTextExtractor(); while (iter.hasNext()) { InputStream stream = iter.next(); if (includeSheetNames) { text.append(iter.getSheetName()); text.append('\n'); Comments comments = includeCellComments ? iter.getSheetComments() : null; processSheet(sheetExtractor, styles, comments, strings, stream); if (includeHeadersFooters) { processShapes(iter.getShapes(), text);
/** * Returns the shapes associated with this sheet, * an empty list or null if there is an exception */ public List<XSSFShape> getShapes() { PackagePart sheetPkg = getSheetPart(); List<XSSFShape> shapes = new LinkedList<>(); // Do we have a comments relationship? (Only ever one if so) try { PackageRelationshipCollection drawingsList = sheetPkg.getRelationshipsByType(XSSFRelation.DRAWINGS.getRelation()); for (int i = 0; i < drawingsList.size(); i++) { PackageRelationship drawings = drawingsList.getRelationship(i); PackagePartName drawingsName = PackagingURIHelper.createPartName(drawings.getTargetURI()); PackagePart drawingsPart = sheetPkg.getPackage().getPart(drawingsName); if (drawingsPart == null) { //parts can go missing; Excel ignores them silently -- TIKA-2134 LOGGER.log(POILogger.WARN, "Missing drawing: " + drawingsName + ". Skipping it."); continue; } XSSFDrawing drawing = new XSSFDrawing(drawingsPart); shapes.addAll(drawing.getShapes()); } } catch (XmlException|InvalidFormatException|IOException e) { LOGGER.log(POILogger.WARN, e); return null; } return shapes; }
XSSFReader.SheetIterator iter = (XSSFReader.SheetIterator)r.getSheetsData(); int sheetNumber = 0; while (iter.hasNext()) { if (this.sheets!=null) { for (int i=0;i<this.sheets.length;i++) { if (iter.getSheetName().equals(this.sheets[i])) { parse=true; break; InputStream rawSheetInputStream = iter.next(); this.sheetNameList.add(iter.getSheetName()); InputSource rawSheetInputSource = new InputSource(rawSheetInputStream); if (HadoopOfficeReadConfiguration.OPTION_LOWFOOTPRINT_PARSER_SAX.equalsIgnoreCase(this.hocr.getLowFootprintParser())) { LOG.info("Using SAX parser for low footprint Excel parsing"); XMLReader sheetParser = SAXHelper.newXMLReader(); XSSFEventParser xssfp = new XSSFEventParser(sheetNumber,iter.getSheetName(), this.spreadSheetCellDAOCache); this.styles, iter.getSheetComments(), this.pushSST, xssfp, this.useDataFormatter, false); sheetParser.setContentHandler(handler); sheetParser.parse(rawSheetInputSource); this.event=false; this.pullSheetInputList.add(rawSheetInputStream); this.pullSheetNameList.add(iter.getSheetName());
while (iter.hasNext()) { InputStream stream = iter.next(); PackagePart sheetPart = iter.getSheetPart(); addDrawingHyperLinks(sheetPart); sheetParts.add(sheetPart); CommentsTable comments = iter.getSheetComments(); xhtml.element("h1", iter.getSheetName()); shapes = iter.getShapes(); } catch (NullPointerException e) {
while (iter.hasNext()) { SheetTextAsHTML sheetExtractor = new SheetTextAsHTML(config, xhtml); PackagePart sheetPart = null; try (InputStream stream = iter.next()) { sheetPart = iter.getSheetPart(); CommentsTable comments = iter.getSheetComments(); xhtml.element("h1", iter.getSheetName()); List<XSSFShape> shapes = iter.getShapes(); processShapes(shapes, xhtml);
SheetTextExtractor sheetExtractor = new SheetTextExtractor(); while (iter.hasNext()) { InputStream stream = iter.next(); if (includeSheetNames) { text.append(iter.getSheetName()); text.append('\n'); Comments comments = includeCellComments ? iter.getSheetComments() : null; processSheet(sheetExtractor, styles, comments, strings, stream); if (includeHeadersFooters) { processShapes(iter.getShapes(), text);
SheetTextExtractor sheetExtractor = new SheetTextExtractor(text); while (iter.hasNext()) { InputStream stream = iter.next(); if(includeSheetNames) { text.append(iter.getSheetName()); text.append('\n');
/**
 * Parses the workbook definition of the given package, locates the requested
 * sheet and hands its stream to {@code processSheet}.
 *
 * <p>Sheets are matched positionally against the list collected by
 * {@code WorkBookContentHandler}. Only sheets whose state equals
 * {@code "visible"} are counted; the sheet whose position among those equals
 * {@code options.sheetIndex()} is processed. If no sheet matches, the method
 * returns without calling {@code processSheet}.
 *
 * @param type     target type forwarded to {@code processSheet}
 * @param consumer receiver forwarded to {@code processSheet}
 * @param open     the already opened OPC package to read from
 * @throws ParserConfigurationException if the XML reader cannot be created
 * @throws IOException                  if reading from the package fails
 * @throws SAXException                 if parsing the XML data fails
 * @throws OpenXML4JException           if the package structure is invalid
 */
protected <T> void unmarshal0(Class<T> type, Consumer<? super T> consumer, OPCPackage open)
        throws ParserConfigurationException, IOException, SAXException, OpenXML4JException {
    ReadOnlySharedStringsTable readOnlySharedStringsTable = new ReadOnlySharedStringsTable(open);
    XSSFReader workbookReader = new XSSFReader(open);
    StylesTable styles = workbookReader.getStylesTable();
    XMLReader reader = SAXHelper.newXMLReader();

    // Keep a direct reference to the handler instead of casting it back from the reader,
    // and make sure the workbook stream is released even when parsing fails.
    WorkBookContentHandler wbch;
    try (InputStream workbookData = workbookReader.getWorkbookData()) {
        wbch = new WorkBookContentHandler(options);
        reader.setContentHandler(wbch);
        reader.parse(new InputSource(workbookData));
    }
    List<WorkBookSheet> sheets = wbch.getSheets();

    int requestedIndex = options.sheetIndex();
    int nonHiddenSheetIndex = 0;
    int sheetCounter = 0;

    SheetIterator iter = (SheetIterator) workbookReader.getSheetsData();
    while (iter.hasNext()) {
        try (InputStream stream = iter.next()) {
            // The iterator and the parsed sheet list are walked in lock-step.
            WorkBookSheet wbs = sheets.get(sheetCounter);
            if (wbs.getState().equals("visible")) {
                if (nonHiddenSheetIndex == requestedIndex) {
                    processSheet(styles, reader, readOnlySharedStringsTable, type, stream, consumer);
                    return;
                }
                nonHiddenSheetIndex++;
            }
        }
        sheetCounter++;
    }
}
/** * Returns the shapes associated with this sheet, * an empty list or null if there is an exception */ public List<XSSFShape> getShapes() { PackagePart sheetPkg = getSheetPart(); List<XSSFShape> shapes = new LinkedList<>(); // Do we have a comments relationship? (Only ever one if so) try { PackageRelationshipCollection drawingsList = sheetPkg.getRelationshipsByType(XSSFRelation.DRAWINGS.getRelation()); for (int i = 0; i < drawingsList.size(); i++) { PackageRelationship drawings = drawingsList.getRelationship(i); PackagePartName drawingsName = PackagingURIHelper.createPartName(drawings.getTargetURI()); PackagePart drawingsPart = sheetPkg.getPackage().getPart(drawingsName); if (drawingsPart == null) { //parts can go missing; Excel ignores them silently -- TIKA-2134 LOGGER.log(POILogger.WARN, "Missing drawing: " + drawingsName + ". Skipping it."); continue; } XSSFDrawing drawing = new XSSFDrawing(drawingsPart); shapes.addAll(drawing.getShapes()); } } catch (XmlException|InvalidFormatException|IOException e) { LOGGER.log(POILogger.WARN, e); return null; } return shapes; }
/**
 * Streams a single worksheet of the workbook into the given CSV sink.
 *
 * <p>The sheet is chosen by {@code reader.sheetName()} when that name is not
 * empty, otherwise by {@code reader.sheetIndex()}. Processing stops after the
 * first matching sheet; if none matches, nothing is processed.
 *
 * @param reader     supplies the sheet name or sheet index to select
 * @param sheetToCSV supplies the OPC package and is passed to {@code processSheet}
 * @throws IOException        if reading the data from the package fails
 * @throws OpenXML4JException if the package cannot be read as a workbook
 * @throws SAXException       if parsing the XML data fails
 */
public void process(Reader reader, SheetToCSV sheetToCSV) throws IOException, OpenXML4JException, SAXException {
    final ReadOnlySharedStringsTable sharedStrings = new ReadOnlySharedStringsTable(sheetToCSV.getOpcPackage());
    final XSSFReader workbook = new XSSFReader(sheetToCSV.getOpcPackage());
    final StylesTable styleTable = workbook.getStylesTable();
    final XSSFReader.SheetIterator sheetIterator = (XSSFReader.SheetIterator) workbook.getSheetsData();
    final boolean selectByName = StringUtil.isNotEmpty(reader.sheetName());

    for (int position = 0; sheetIterator.hasNext(); ++position) {
        try (InputStream sheetStream = sheetIterator.next()) {
            final String currentName = sheetIterator.getSheetName();
            // Name selection takes precedence; the index is only consulted without a name.
            final boolean selected = selectByName
                    ? reader.sheetName().equals(currentName)
                    : reader.sheetIndex() == position;
            if (selected) {
                processSheet(styleTable, sharedStrings, sheetToCSV, sheetStream);
                break;
            }
        }
    }
}
// Open the source workbook read-only and stream each sheet through a SAX handler,
// writing the workbook built by SheetToWorkbookSaver to its own "report_<sheet>.xlsx" file.
OPCPackage opcPackage = OPCPackage.open(originalFile, PackageAccess.READ);
ReadOnlySharedStringsTable strings = new ReadOnlySharedStringsTable(opcPackage);
XSSFReader xssfReader = new XSSFReader(opcPackage);
StylesTable styles = xssfReader.getStylesTable();
XSSFReader.SheetIterator iter = (XSSFReader.SheetIterator) xssfReader.getSheetsData();
int index = 0;
while (iter.hasNext()) {
    String sheetName;
    SheetToWorkbookSaver saver;
    // try-with-resources: the sheet stream is released even if parsing fails.
    try (InputStream stream = iter.next()) {
        sheetName = iter.getSheetName();
        DataFormatter formatter = new DataFormatter();
        InputSource sheetSource = new InputSource(stream);
        saver = new SheetToWorkbookSaver(sheetName);
        XMLReader sheetParser = SAXHelper.newXMLReader();
        ContentHandler handler = new XSSFSheetXMLHandler(
                styles, null, strings, saver, formatter, false);
        sheetParser.setContentHandler(handler);
        sheetParser.parse(sheetSource);
    } catch (ParserConfigurationException e) {
        // Keep the original exception as the cause so the stack trace is not lost.
        throw new RuntimeException("SAX parser appears to be broken - " + e.getMessage(), e);
    }

    // this creates new File descriptors inside storage
    FileDto partFile = new FileDto("report_".concat(StringUtils.trimToEmpty(sheetName)).concat(".xlsx"));
    File cloneFile = fileStorage.read(partFile);
    // The output stream is closed even when writing the workbook fails.
    try (FileOutputStream cloneFos = new FileOutputStream(cloneFile)) {
        saver.getWb().write(cloneFos);
    }
}
/**
 * Processes the sheets of the workbook package in order and returns the
 * accumulated results.
 *
 * <p>For every sheet a line consisting of {@code ExcelValidator.SHEET_NAME_PREFIX}
 * followed by the sheet name is appended to {@code results} before the sheet
 * is handed to {@code processSheet}. Iteration stops early as soon as
 * {@code blankRowNum} equals 10 (presumably a blank-row counter maintained
 * while sheets are processed; confirm against the sheet handler).
 *
 * @return the {@code results} list of this instance
 * @throws IOException                  if reading the data from the package fails
 * @throws OpenXML4JException           if the package cannot be read as a workbook
 * @throws ParserConfigurationException if the XML parser cannot be configured
 * @throws SAXException                 if parsing the XML data fails
 */
public List<String> process() throws IOException, OpenXML4JException, ParserConfigurationException, SAXException {
    ReadOnlySharedStringsTable strings = new ReadOnlySharedStringsTable(this.xlsxPackage);
    XSSFReader xssfReader = new XSSFReader(this.xlsxPackage);
    StylesTable styles = xssfReader.getStylesTable();
    XSSFReader.SheetIterator iter = (XSSFReader.SheetIterator) xssfReader.getSheetsData();
    while (iter.hasNext()) {
        if (blankRowNum == 10) {
            break;
        }
        // try-with-resources: the sheet stream is closed even if processSheet throws.
        try (InputStream stream = iter.next()) {
            String sheetName = iter.getSheetName();
            results.add(ExcelValidator.SHEET_NAME_PREFIX + sheetName);
            processSheet(styles, strings, new SheetToCSV(), stream);
        }
    }
    return results;
}
/**
 * Reads every worksheet of every given Excel file and feeds its content to
 * the row/cell handler created for the importer.
 *
 * <p>For each worksheet the handler is started with the sheet name, the sheet
 * XML is parsed with a namespace-aware SAX parser, and the handler is finished
 * afterwards.
 *
 * @param files    the files to load; the right element of each tuple is the file to open
 * @param importer the importer passed to {@code makeRowCellHandler}
 * @throws InvalidFileException if a file cannot be parsed as an Excel workbook
 * @throws IOException          if reading a file fails
 */
@Override
public void loadData(List<Tuple<String, File>> files, Importer importer) throws InvalidFileException, IOException {
    try {
        RowCellHandler rowCellHandler = makeRowCellHandler(importer);
        SAXParserFactory saxFactory = SAXParserFactory.newInstance();
        saxFactory.setNamespaceAware(true);
        for (Tuple<String, File> file : files) {
            OPCPackage pkg = OPCPackage.open(file.getRight().getPath());
            try {
                XSSFReader xssfReader = new XSSFReader(pkg);
                final SharedStringsTable sharedStringsTable = xssfReader.getSharedStringsTable();
                XSSFReader.SheetIterator worksheets = (XSSFReader.SheetIterator) xssfReader.getSheetsData();
                while (worksheets.hasNext()) {
                    // Close each sheet stream as soon as it has been parsed (or parsing failed).
                    try (InputStream sheet = worksheets.next()) {
                        XMLReader sheetParser = saxFactory.newSAXParser().getXMLReader();
                        sheetParser.setContentHandler(new SheetXmlParser(sharedStringsTable, rowCellHandler));
                        rowCellHandler.start(worksheets.getSheetName());
                        sheetParser.parse(new InputSource(sheet));
                        rowCellHandler.finish();
                    }
                }
            } finally {
                // OPCPackage.open(String) opens the package read-write, so close() would
                // try to save it back to disk; revert() releases it without writing.
                pkg.revert();
            }
        }
    } catch (SAXException | OpenXML4JException | ParserConfigurationException e) {
        throw new InvalidFileException("Not a valid Excel file", e);
    }
}
/** * Returns the comments associated with this sheet, * or null if there aren't any */ public CommentsTable getSheetComments() { PackagePart sheetPkg = getSheetPart(); // Do we have a comments relationship? (Only ever one if so) try { PackageRelationshipCollection commentsList = sheetPkg.getRelationshipsByType(XSSFRelation.SHEET_COMMENTS.getRelation()); if (commentsList.size() > 0) { PackageRelationship comments = commentsList.getRelationship(0); PackagePartName commentsName = PackagingURIHelper.createPartName(comments.getTargetURI()); PackagePart commentsPart = sheetPkg.getPackage().getPart(commentsName); return new CommentsTable(commentsPart); } } catch (InvalidFormatException|IOException e) { LOGGER.log(POILogger.WARN, e); return null; } return null; }
/** * Returns the comments associated with this sheet, * or null if there aren't any */ public CommentsTable getSheetComments() { PackagePart sheetPkg = getSheetPart(); // Do we have a comments relationship? (Only ever one if so) try { PackageRelationshipCollection commentsList = sheetPkg.getRelationshipsByType(XSSFRelation.SHEET_COMMENTS.getRelation()); if(commentsList.size() > 0) { PackageRelationship comments = commentsList.getRelationship(0); PackagePartName commentsName = PackagingURIHelper.createPartName(comments.getTargetURI()); PackagePart commentsPart = sheetPkg.getPackage().getPart(commentsName); return new CommentsTable(commentsPart, comments); } } catch (InvalidFormatException e) { return null; } catch (IOException e) { return null; } return null; }
/**
 * Walks all sheets of the workbook package and converts each one to CSV.
 *
 * <p>Before a sheet is processed, an empty line and a header line containing
 * the sheet name and its zero-based position are printed to {@code output}.
 *
 * @throws IOException        if reading the data from the package fails
 * @throws OpenXML4JException if the package cannot be read as a workbook
 * @throws SAXException       if parsing the XML data fails
 */
public void process() throws IOException, OpenXML4JException, SAXException {
    final ReadOnlySharedStringsTable sharedStrings = new ReadOnlySharedStringsTable(this.xlsxPackage);
    final XSSFReader workbook = new XSSFReader(this.xlsxPackage);
    final StylesTable styleTable = workbook.getStylesTable();
    final XSSFReader.SheetIterator sheetIterator = (XSSFReader.SheetIterator) workbook.getSheetsData();

    for (int position = 0; sheetIterator.hasNext(); ++position) {
        // Each sheet stream is closed as soon as its sheet has been handled.
        try (InputStream sheetStream = sheetIterator.next()) {
            final String name = sheetIterator.getSheetName();
            this.output.println();
            this.output.println(name + " [index=" + position + "]:");
            processSheet(styleTable, sharedStrings, new SheetToCSV(), sheetStream);
        }
    }
}
/**
 * Get the sheet that should be parsed.
 *
 * <p>When no sheet name is configured the first sheet is returned; otherwise
 * the first sheet whose name equals the configured name is returned. Streams
 * of sheets that are skipped are closed before moving on. The caller owns the
 * returned stream and is responsible for closing it.
 *
 * @param options The options
 * @param iter The sheet iterator
 * @return the input stream of the selected sheet
 * @throws DataFrameException if no sheet matches the configured sheet name
 */
private InputStream getSheetForParsing(ExcelSourceOptions<R> options, XSSFReader.SheetIterator iter) {
    while (iter.hasNext()) {
        final InputStream inputStream = iter.next();
        final String sheetName = iter.getSheetName();
        if (options.getSheetName() == null || sheetName.equals(options.getSheetName())) {
            return inputStream;
        }
        // Not the requested sheet: release its stream instead of leaking it.
        try {
            inputStream.close();
        } catch (java.io.IOException ignored) {
            // Best effort: nothing was read from this stream, so a failed close
            // must not prevent the search for the requested sheet.
        }
    }
    throw new DataFrameException("No sheet found for that matched configured sheet " + options.getSheetName());
}
void loadSheets(XSSFReader reader, SharedStringsTable sst, StylesTable stylesTable, int rowCacheSize) throws IOException, InvalidFormatException, XMLStreamException { lookupSheetNames(reader); //Some workbooks have multiple references to the same sheet. Need to filter //them out before creating the XMLEventReader by keeping track of their URIs. //The sheets are listed in order, so we must keep track of insertion order. SheetIterator iter = (SheetIterator) reader.getSheetsData(); Map<URI, InputStream> sheetStreams = new LinkedHashMap<>(); while(iter.hasNext()) { InputStream is = iter.next(); sheetStreams.put(iter.getSheetPart().getPartName().getURI(), is); } //Iterate over the loaded streams int i = 0; for(URI uri : sheetStreams.keySet()) { XMLEventReader parser = StaxHelper.newXMLInputFactory().createXMLEventReader(sheetStreams.get(uri)); sheets.add(new StreamingSheet(sheetProperties.get(i++).get("name"), new StreamingSheetReader(sst, stylesTable, parser, use1904Dates, rowCacheSize))); } }