/**
 * Private constructor: not callable from outside the declaring class.
 *
 * <p>Its only action is to call {@code setMaxStringLength(-1)} on the
 * {@code tika} reference that is in scope.
 */
private Tika() {
    // NOTE(review): -1 presumably means "no limit" -- confirm against the
    // setMaxStringLength documentation of the type behind `tika`.
    tika.setMaxStringLength(-1);
}
/**
 * Extracts the text content of the file located at the given path.
 *
 * <p>Convenience overload: delegates to {@code parse(path, metadata)} with a
 * newly created {@link Metadata} instance.
 *
 * @param path the path of the file to be parsed
 * @return extracted text content
 * @throws IOException if the file can not be read or parsed
 */
public Reader parse(Path path) throws IOException {
    Metadata metadata = new Metadata();
    return parse(path, metadata);
}
/**
 * Checks that closing a {@link RereadableInputStream} leaves the wrapped
 * stream in the closed state requested at construction time.
 *
 * @param wantToClose value handed to the {@code RereadableInputStream}
 *                    constructor as its last argument; also the closed state
 *                    the wrapped stream is expected to report afterwards
 * @throws IOException if creating or closing one of the streams fails
 */
private void doACloseBehaviorTest(boolean wantToClose) throws IOException {
    TestInputStream tis = createTestInputStream();
    try {
        RereadableInputStream ris = new RereadableInputStream(tis, 5, true, wantToClose);
        ris.close();
        assertEquals(wantToClose, tis.isClosed());
    } finally {
        // Release the wrapped stream even when close() throws or the
        // assertion fails; otherwise a failing run leaks the open stream.
        if (!tis.isClosed()) {
            tis.close();
        }
    }
}
/**
 * Runs the detector tests over the files selected by {@code filter}: first
 * {@code testDetectorEach} (one file at a time), then
 * {@code testDetectorOnAll} (all selected files together).
 *
 * @param detector              detector under test
 * @param numThreads            passed through to both test stages
 * @param numIterations         passed through to both test stages
 * @param filter                handed to {@code getTestFiles} to select the files
 * @param randomlyResizeSAXPool passed through to both test stages
 * @throws Exception if selecting the files or running a stage fails
 */
public void testDetector(Detector detector, int numThreads, int numIterations,
                         FileFilter filter, int randomlyResizeSAXPool) throws Exception {
    final Path[] selected = getTestFiles(filter);

    // Stage 1: every file on its own.
    testDetectorEach(detector, selected, numThreads, numIterations, randomlyResizeSAXPool);
    // Stage 2: the whole set at once.
    testDetectorOnAll(detector, selected, numThreads, numIterations, randomlyResizeSAXPool);
}
/**
 * Runs {@link #testEach(Parser, Path[], ParseContext[], int, int)} and then
 * {@link #testAll(Parser, Path[], ParseContext[], int, int)} on the files
 * selected by {@code filter}.
 *
 * @param parser        parser under test
 * @param parseContext  parse contexts handed unchanged to both stages
 * @param numThreads    number of threads to use
 * @param numIterations number of iterations per thread
 * @param filter        file filter to select files from "/test-documents"; if
 *                      <code>null</code>, all files will be used
 * @throws Exception if selecting the files or running a stage fails
 */
protected void testMultiThreaded(Parser parser, ParseContext[] parseContext, int numThreads,
                                 int numIterations, FileFilter filter) throws Exception {
    final Path[] selected = getTestFiles(filter);

    // Each file individually first, then the whole set together.
    testEach(parser, selected, parseContext, numThreads, numIterations);
    testAll(parser, selected, parseContext, numThreads, numIterations);
}
/**
 * Opens the resource {@code "/test-documents/" + filePath} and delegates to
 * the stream-based {@code getRecursiveMetadata} overload. The stream is closed
 * when this method returns or throws.
 *
 * @param filePath          resource name relative to "/test-documents/"
 * @param context           passed through to the stream-based overload
 * @param metadata          passed through to the stream-based overload
 * @param suppressException passed through to the stream-based overload
 * @return whatever the stream-based overload returns
 * @throws Exception if opening, processing or closing the resource fails
 */
protected List<Metadata> getRecursiveMetadata(String filePath, ParseContext context,
                                              Metadata metadata, boolean suppressException)
        throws Exception {
    final String resource = "/test-documents/" + filePath;
    try (InputStream stream = getResourceAsStream(resource)) {
        return getRecursiveMetadata(stream, context, metadata, suppressException);
    }
}
/**
 * Opens the resource {@code "/test-documents/" + filePath} and delegates to
 * the four-argument stream-based {@code getXML} overload, passing
 * {@code null} as the last argument.
 *
 * @param filePath resource name relative to "/test-documents/"
 * @param parser   passed through to the stream-based overload
 * @param metadata passed through to the stream-based overload
 * @return whatever the stream-based overload returns
 * @throws Exception if opening, processing or closing the resource fails
 */
protected XMLResult getXML(String filePath, Parser parser, Metadata metadata) throws Exception {
    // Own the stream here so it is released on every path. If the delegate
    // already closes it, the second close is expected to be a no-op, since
    // Closeable.close() is specified to be idempotent.
    try (InputStream input = getResourceAsStream("/test-documents/" + filePath)) {
        return getXML(input, parser, metadata, null);
    }
}
/**
 * Builds a {@code TestInputStream} over the file produced by
 * {@code createTestFile()}, with a {@link BufferedInputStream} in between.
 *
 * @return the newly created stream
 * @throws IOException if the test file cannot be created or opened
 */
private TestInputStream createTestInputStream() throws IOException {
    FileInputStream raw = new FileInputStream(createTestFile());
    BufferedInputStream buffered = new BufferedInputStream(raw);
    return new TestInputStream(buffered);
}
/**
 * Convenience overload: delegates to the four-argument
 * {@code getRecursiveMetadata} with {@code suppressException} set to
 * {@code false}.
 *
 * @param filePath passed through to the four-argument overload
 * @param context  passed through to the four-argument overload
 * @param metadata passed through to the four-argument overload
 * @return whatever the four-argument overload returns
 * @throws Exception if the delegate throws
 */
protected List<Metadata> getRecursiveMetadata(String filePath, ParseContext context,
                                              Metadata metadata) throws Exception {
    final boolean suppressException = false;
    return getRecursiveMetadata(filePath, context, metadata, suppressException);
}
/**
 * Calls {@code testDetectorOnAll} once per file, each time with a
 * single-element array holding just that file.
 *
 * @param detector              detector under test
 * @param files                 files to test, one at a time
 * @param numThreads            passed through to {@code testDetectorOnAll}
 * @param numIterations         passed through to {@code testDetectorOnAll}
 * @param randomlyResizeSAXPool passed through to {@code testDetectorOnAll}
 */
void testDetectorEach(Detector detector, Path[] files, int numThreads, int numIterations,
                      int randomlyResizeSAXPool) {
    for (Path file : files) {
        // A new one-element array per iteration, as before.
        testDetectorOnAll(detector, new Path[] {file}, numThreads, numIterations,
                randomlyResizeSAXPool);
    }
}
/**
 * Convenience overload: delegates to the four-argument {@code getXML},
 * passing {@code null} as the last argument.
 *
 * @param input    passed through to the four-argument overload
 * @param parser   passed through to the four-argument overload
 * @param metadata passed through to the four-argument overload
 * @return whatever the four-argument overload returns
 * @throws Exception if the delegate throws
 */
protected XMLResult getXML(InputStream input, Parser parser, Metadata metadata)
        throws Exception {
    return getXML(
            input,
            parser,
            metadata,
            null);
}
/**
 * Tests the files one at a time, each across multiple threads, by calling
 * {@code testAll} with a single-element array per file.
 *
 * <p>This was needed to test TIKA-2519 in a reasonable amount of time: because
 * every thread works on the same file, the parser is forced to use the same
 * underlying memory structures. That is stricter than what our agreement with
 * clients is thought to require, since the run uses literally the same file
 * rather than one copy per thread. Leave it this way unless there is a good
 * reason to create a separate copy per thread.
 *
 * @param parser        parser under test
 * @param files         files to test, one at a time
 * @param parseContext  passed through to {@code testAll}
 * @param numThreads    number of threads to use
 * @param numIterations number of iterations per thread
 */
protected void testEach(Parser parser, Path[] files, ParseContext[] parseContext,
                        int numThreads, int numIterations) {
    for (Path file : files) {
        // A new one-element array per iteration, as before.
        testAll(parser, new Path[] {file}, parseContext, numThreads, numIterations);
    }
}
/**
 * Checks that the constructor's readToEndOfStreamOnFirstRewind parameter
 * correctly determines the behavior, by running {@code doTestRewind} with
 * {@code true} and then with {@code false}.
 *
 * @throws IOException if {@code doTestRewind} throws
 */
@Test
public void testRewind() throws IOException {
    // Order matters for trace equivalence: true first, then false.
    for (boolean flag : new boolean[] {true, false}) {
        doTestRewind(flag);
    }
}
/**
 * Runs {@code doACloseBehaviorTest} with {@code true} and then with
 * {@code false}.
 *
 * @throws IOException if {@code doACloseBehaviorTest} throws
 */
@Test
public void testCloseBehavior() throws IOException {
    // Order matters for trace equivalence: true first, then false.
    for (boolean wantToClose : new boolean[] {true, false}) {
        doACloseBehaviorTest(wantToClose);
    }
}
/**
 * Extracts the text content of the given file.
 *
 * <p>Convenience overload: delegates to {@code parse(file, metadata)} with a
 * newly created {@link Metadata} instance.
 *
 * @param file the file to be parsed
 * @return extracted text content
 * @throws IOException if the file can not be read or parsed
 * @see #parse(Path)
 */
public Reader parse(File file) throws IOException {
    Metadata metadata = new Metadata();
    return parse(file, metadata);
}
protected void getMimeType(InputStream inputStream, String fileName, StaticAsset newAsset) { Tika tika = new Tika(); String tikaMimeType = tika.detect(fileName); if (tikaMimeType == null) { try { tikaMimeType = tika.detect(inputStream); } catch (IOException e) { //if tika can't resolve, don't throw exception } } if (tikaMimeType != null) { newAsset.setMimeType(tikaMimeType); } }
/**
 * Convenience overload: delegates to the three-argument
 * {@code getRecursiveMetadata}, passing
 * {@code BasicContentHandlerFactory.HANDLER_TYPE.XML} as the last argument.
 *
 * @param filePath     passed through to the delegate
 * @param parserToWrap passed through to the delegate
 * @return whatever the delegate returns
 * @throws Exception if the delegate throws
 */
protected List<Metadata> getRecursiveMetadata(String filePath, Parser parserToWrap)
        throws Exception {
    return getRecursiveMetadata(
            filePath,
            parserToWrap,
            BasicContentHandlerFactory.HANDLER_TYPE.XML);
}