/**
 * Uploads the PDF at {@code filePath} to the GROBID REST header-processing endpoint and copies
 * every field of the parsed TEI response into {@code metadata}, prefixing each key with
 * {@code grobid:header_}.
 *
 * <p>Any failure while reading or parsing the response is logged as a warning and not
 * propagated.
 *
 * @param filePath path of the PDF file to upload
 * @param handler  content handler; not used by this method
 * @param metadata receives one {@code grobid:header_<key>} entry per TEI metadata name
 * @param context  parse context passed through to the TEI parser
 * @throws FileNotFoundException if the file cannot be opened for reading
 */
public void parse(String filePath, ContentHandler handler, Metadata metadata,
                  ParseContext context) throws FileNotFoundException {
    File pdfFile = new File(filePath);
    ContentDisposition cd = new ContentDisposition(
            "form-data; name=\"input\"; filename=\"" + pdfFile.getName() + "\"");

    // Opened before the try block so that FileNotFoundException still reaches the caller.
    FileInputStream pdfStream = new FileInputStream(pdfFile);
    try {
        Attachment att = new Attachment("input", pdfStream, cd);
        MultipartBody body = new MultipartBody(att);

        Response response = WebClient
                .create(restHostUrlStr + GROBID_PROCESSHEADER_PATH)
                .accept(MediaType.APPLICATION_XML)
                .type(MediaType.MULTIPART_FORM_DATA)
                .post(body);

        try {
            String resp = response.readEntity(String.class);
            Metadata teiMet = new TEIDOMParser().parse(resp, context);
            for (String key : teiMet.names()) {
                metadata.add("grobid:header_" + key, teiMet.get(key));
            }
        } catch (Exception e) {
            LOG.warn("Couldn't read response", e);
        }
    } finally {
        // Release the file handle whether or not the request succeeded.
        try {
            pdfStream.close();
        } catch (java.io.IOException e) {
            LOG.warn("Couldn't close input stream", e);
        }
    }
}
private void configureAllowOrigins(boolean allOrigins, String[] originList) { if (allOrigins) { originList = new String[0]; } // tell filter what to do. String confResult = configClient.accept("text/plain").replacePath("/setOriginList") .type("application/json").post(originList, String.class); assertEquals("ok", confResult); }
/**
 * PUTs the test Word document to the {@code /xxx} field path under the metadata endpoint
 * and expects an HTTP 404 (Not Found) status.
 */
@Test
public void testGetField_XXX_NotFound() throws Exception {
    WebClient client = WebClient.create(endPoint + META_PATH + "/xxx")
            .type("application/msword")
            .accept(MediaType.APPLICATION_JSON);

    Response response = client.put(
            ClassLoader.getSystemResourceAsStream(TikaResourceTest.TEST_DOC));

    int expectedStatus = Response.Status.NOT_FOUND.getStatusCode();
    Assert.assertEquals(expectedStatus, response.getStatus());
}
// Registers the Jackson JSON provider, builds a WebClient for networkURI with those providers,
// stores inputText under the "inputText" key of jsonNode, POSTs jsonNode as JSON (accepting JSON
// back), and wraps the response entity stream in a UTF-8 BufferedReader.
// NOTE(review): the reader is not closed within this visible fragment - confirm the surrounding
// code closes it.
providers.add(jacksonJsonProvider); client = WebClient.create(networkURI, providers); jsonNode.put("inputText", inputText); Response response = client.accept(MediaType.APPLICATION_JSON).type(MediaType.APPLICATION_JSON).post(jsonNode); BufferedReader reader = new BufferedReader(new InputStreamReader( (InputStream) response.getEntity(), UTF_8));
/**
 * PUTs the result of {@code copy(stream, 8000)} for the test Word document to the
 * {@code /Author} field path, asking for plain text, and expects HTTP 400 (Bad Request).
 */
@Test
public void testGetField_Author_TEXT_Partial_BAD_REQUEST() throws Exception {
    InputStream docStream = ClassLoader.getSystemResourceAsStream(TikaResourceTest.TEST_DOC);

    WebClient client = WebClient.create(endPoint + META_PATH + "/Author")
            .type("application/msword")
            .accept(MediaType.TEXT_PLAIN);

    // presumably copy(...) yields only the first 8000 bytes of the document - verify against helper
    Response response = client.put(copy(docStream, 8000));

    int expectedStatus = Response.Status.BAD_REQUEST.getStatusCode();
    Assert.assertEquals(expectedStatus, response.getStatus());
}
/**
 * Sends {@code testOCR.pdf} with the PDF-prefixed {@code averageCharTolerance} header set to
 * {@code "2.0"} and expects the request to succeed with HTTP 200.
 */
@Test
public void testFloatInHeader() {
    String toleranceHeader = TikaResource.X_TIKA_PDF_HEADER_PREFIX + "averageCharTolerance";

    WebClient client = WebClient.create(endPoint + TIKA_PATH)
            .type("application/pdf")
            .accept("text/plain")
            .header(toleranceHeader, "2.0");

    Response response = client.put(ClassLoader.getSystemResourceAsStream("testOCR.pdf"));

    assertEquals(200, response.getStatus());
}
/**
 * PUTs {@code testRTF_npeFromWMFInTikaServer.rtf} to the Tika endpoint as RTF, asks for plain
 * text, and checks that the returned text contains {@code "Example text"}.
 */
@Test
public void testWMFInRTF() throws Exception {
    WebClient client = WebClient.create(endPoint + TIKA_PATH)
            .type("application/rtf")
            .accept("text/plain");

    Response response = client.put(
            ClassLoader.getSystemResourceAsStream("testRTF_npeFromWMFInTikaServer.rtf"));

    InputStream entity = (InputStream) response.getEntity();
    String extractedText = getStringFromInputStream(entity);

    assertTrue(extractedText.contains("Example text"));
}
/**
 * Sends {@code testOCR.pdf} with the OCR-prefixed {@code trustedPageSeparator} header and
 * expects the server to answer with HTTP 500.
 */
@Test
public void testTrustedMethodPrevention() {
    String separatorHeader = TikaResource.X_TIKA_OCR_HEADER_PREFIX + "trustedPageSeparator";

    WebClient client = WebClient.create(endPoint + TIKA_PATH)
            .type("application/pdf")
            .accept("text/plain")
            .header(separatorHeader, "\u0020");

    Response response = client.put(ClassLoader.getSystemResourceAsStream("testOCR.pdf"));

    assertEquals(500, response.getStatus());
}
/**
 * PUTs the test Word document to the Tika endpoint, asks for {@code text/xml}, and checks
 * that the response body contains {@code "test"}.
 */
@Test
public void testSimpleWordXML() throws Exception {
    WebClient client = WebClient.create(endPoint + TIKA_PATH)
            .type("application/msword")
            .accept("text/xml");

    Response response = client.put(ClassLoader.getSystemResourceAsStream(TEST_DOC));

    InputStream entity = (InputStream) response.getEntity();
    String xml = getStringFromInputStream(entity);

    assertTrue(xml.contains("test"));
}
/**
 * PUTs {@code TEST_DOC_WAV} to the unpacker endpoint requesting a zip archive, and asserts
 * that the value read back for {@code JPG_NAME} equals {@code JPG_MD5}.
 */
@Test
public void testDocPicture() throws Exception {
    WebClient client = WebClient.create(endPoint + UNPACKER_PATH)
            .type(APPLICATION_MSWORD)
            .accept("application/zip");

    Response response = client.put(ClassLoader.getSystemResourceAsStream(TEST_DOC_WAV));

    InputStream zipStream = (InputStream) response.getEntity();
    Map<String, String> entries = readZipArchive(zipStream);

    assertEquals(JPG_MD5, entries.get(JPG_NAME));
}
/**
 * PUTs {@code 2pic.doc} to the unpacker endpoint requesting a zip archive, and asserts that
 * the value read back for {@code JPG2_NAME} equals {@code JPG2_MD5}.
 */
@Test
public void testDocPictureNoOle() throws Exception {
    WebClient client = WebClient.create(endPoint + UNPACKER_PATH)
            .type(APPLICATION_MSWORD)
            .accept("application/zip");

    Response response = client.put(ClassLoader.getSystemResourceAsStream("2pic.doc"));

    InputStream zipStream = (InputStream) response.getEntity();
    Map<String, String> entries = readZipArchive(zipStream);

    assertEquals(JPG2_MD5, entries.get(JPG2_NAME));
}
/**
 * PUTs the test Word document to the Tika endpoint, asks for plain text, and checks that the
 * response body contains {@code "test"}.
 */
@Test
public void testSimpleWord() throws Exception {
    WebClient client = WebClient.create(endPoint + TIKA_PATH)
            .type("application/msword")
            .accept("text/plain");

    Response response = client.put(ClassLoader.getSystemResourceAsStream(TEST_DOC));

    InputStream entity = (InputStream) response.getEntity();
    String plainText = getStringFromInputStream(entity);

    assertTrue(plainText.contains("test"));
}
/**
 * PUTs the test Word document to the metadata endpoint, asks for {@code application/rdf+xml},
 * and checks that the response contains the {@code rdf:li} element for "Maxim Valyanskiy".
 */
@Test
public void testXMP() throws Exception {
    WebClient client = WebClient.create(endPoint + META_PATH)
            .type("application/msword")
            .accept("application/rdf+xml");

    Response response = client.put(
            ClassLoader.getSystemResourceAsStream(TikaResourceTest.TEST_DOC));

    InputStream entity = (InputStream) response.getEntity();
    String rdf = IOUtils.readStringFromStream(entity);

    assertContains("<rdf:li>Maxim Valyanskiy</rdf:li>", rdf);
}
@Test public void testTextMainMultipart() throws Exception { //boilerpipe Attachment attachmentPart = new Attachment("myhtml", "text/html", ClassLoader.getSystemResourceAsStream("testHTML.html")); Response response = WebClient.create(endPoint + TIKA_PATH+"/form/main") .type("multipart/form-data") .accept("text/plain") .post(attachmentPart); String responseMsg = getStringFromInputStream((InputStream) response .getEntity()); assertTrue(responseMsg.contains("Title : Test Indexation Html")); assertFalse(responseMsg.contains("Indexation du fichier")); }
/**
 * POSTs the test Word document as a multipart attachment to the {@code /form} path, asks for
 * {@code text/xml}, and checks that the response contains {@code "test"} as well as the
 * expected {@code X-TIKA:digest:MD5} meta element.
 */
@Test
public void testSimpleWordMultipartXML() throws Exception {
    // The document stream is opened exactly once, for the attachment; the earlier stand-alone
    // getSystemResourceAsStream call opened a second stream that was never used or closed.
    Attachment attachmentPart = new Attachment("myworddoc", "application/msword",
            ClassLoader.getSystemResourceAsStream(TEST_DOC));

    WebClient webClient = WebClient.create(endPoint + TIKA_PATH + "/form");
    Response response = webClient.type("multipart/form-data")
            .accept("text/xml")
            .post(attachmentPart);

    String responseMsg = getStringFromInputStream((InputStream) response.getEntity());

    assertTrue(responseMsg.contains("test"));
    assertContains(
            "<meta name=\"X-TIKA:digest:MD5\" content=\"f8be45c34e8919eedba48cc8d207fbf0\"/>",
            responseMsg);
}