/**
 * Asks the remote language-identification service which language {@code content} is in.
 *
 * <p>The text is sent with a PUT to {@code restHostUrlStr + TEXT_LID_PATH}. The reply body is
 * read as a string, parsed as JSON, and its {@code "language"} member is returned.
 *
 * @param content text handed to the service as the request body
 * @return the string value of the {@code "language"} member, or {@code null} when any exception
 *         is raised while calling the service or decoding the reply (logged at WARN level)
 */
private String detect(String content) {
    try {
        WebClient client = WebClient.create(restHostUrlStr + TEXT_LID_PATH);
        Response reply = client.put(content);
        String body = reply.readEntity(String.class);
        return new JsonParser()
                .parse(body)
                .getAsJsonObject()
                .get("language")
                .getAsString();
    } catch (Exception e) {
        // Best effort: a failed lookup is reported and surfaces to the caller as null.
        LOG.warn("problem detecting", e);
        return null;
    }
}
/**
 * PUTs the {@code 2exe.docx} classpath resource to the unpacker endpoint, asking for
 * {@code application/zip}, and checks the archive entries named by {@code DOCX_EXE1_NAME} and
 * {@code DOCX_EXE2_NAME} against the expected values in {@code DOCX_EXE1_MD5} /
 * {@code DOCX_EXE2_MD5}.
 */
@Test
public void testExeDOCX() throws Exception {
    WebClient client = WebClient.create(endPoint + UNPACKER_PATH).accept("application/zip");
    InputStream document = ClassLoader.getSystemResourceAsStream("2exe.docx");
    Response response = client.put(document);

    InputStream zipBody = (InputStream) response.getEntity();
    Map<String, String> entries = readZipArchive(zipBody);

    assertEquals(DOCX_EXE1_MD5, entries.get(DOCX_EXE1_NAME));
    assertEquals(DOCX_EXE2_MD5, entries.get(DOCX_EXE2_NAME));
}
/**
 * PUTs the {@code pic.xls} classpath resource to the unpacker endpoint, asking for
 * {@code application/zip}, and checks that the archive entries {@code 0.jpg} and {@code 1.jpg}
 * carry the values expected in {@code XSL_IMAGE1_MD5} and {@code XSL_IMAGE2_MD5}.
 */
@Test
public void testImageXSL() throws Exception {
    WebClient client = WebClient.create(endPoint + UNPACKER_PATH).accept("application/zip");
    InputStream spreadsheet = ClassLoader.getSystemResourceAsStream("pic.xls");
    Response response = client.put(spreadsheet);

    InputStream zipBody = (InputStream) response.getEntity();
    Map<String, String> entries = readZipArchive(zipBody);

    assertEquals(XSL_IMAGE1_MD5, entries.get("0.jpg"));
    assertEquals(XSL_IMAGE2_MD5, entries.get("1.jpg"));
}
/**
 * Requests the metadata field {@code xxx} for the test Word document as JSON and expects the
 * service to answer with HTTP 404 (Not Found).
 */
@Test
public void testGetField_XXX_NotFound() throws Exception {
    String url = endPoint + META_PATH + "/xxx";
    WebClient client = WebClient.create(url)
            .type("application/msword")
            .accept(MediaType.APPLICATION_JSON);

    Response response =
            client.put(ClassLoader.getSystemResourceAsStream(TikaResourceTest.TEST_DOC));

    int expectedStatus = Response.Status.NOT_FOUND.getStatusCode();
    int actualStatus = response.getStatus();
    Assert.assertEquals(expectedStatus, actualStatus);
}
@Test public void testTextMain() throws Exception { //boilerpipe Response response = WebClient.create(endPoint + TIKA_PATH + "/main") .accept("text/plain") .put(ClassLoader.getSystemResourceAsStream("testHTML.html")); String responseMsg = getStringFromInputStream((InputStream) response .getEntity()); assertTrue(responseMsg.contains("Title : Test Indexation Html")); assertFalse(responseMsg.contains("Indexation du fichier")); }
/**
 * PUTs the {@code TEST_DOCX_IMAGE} classpath resource to the unpacker endpoint, asking for
 * {@code application/zip}, and checks the archive entries named by {@code DOCX_IMAGE1_NAME}
 * and {@code DOCX_IMAGE2_NAME} against {@code DOCX_IMAGE1_MD5} / {@code DOCX_IMAGE2_MD5}.
 */
@Test
public void testImageDOCX() throws Exception {
    WebClient client = WebClient.create(endPoint + UNPACKER_PATH).accept("application/zip");
    InputStream document = ClassLoader.getSystemResourceAsStream(TEST_DOCX_IMAGE);
    Response response = client.put(document);

    InputStream zipBody = (InputStream) response.getEntity();
    Map<String, String> entries = readZipArchive(zipBody);

    assertEquals(DOCX_IMAGE1_MD5, entries.get(DOCX_IMAGE1_NAME));
    assertEquals(DOCX_IMAGE2_MD5, entries.get(DOCX_IMAGE2_NAME));
}
/**
 * PUTs {@code password.xls} to the Tika endpoint as {@code application/vnd.ms-excel}, asking
 * for {@code text/html}, and expects the response status to equal {@code UNPROCESSEABLE}.
 */
@Test
public void testPasswordXLSHTML() throws Exception {
    WebClient client = WebClient.create(endPoint + TIKA_PATH)
            .type("application/vnd.ms-excel")
            .accept("text/html");

    Response response = client.put(ClassLoader.getSystemResourceAsStream("password.xls"));

    assertEquals(UNPROCESSEABLE, response.getStatus());
}
/**
 * PUTs the {@code TEST_DOC_WAV} resource to the unpacker endpoint as {@code APPLICATION_MSWORD},
 * asking for {@code application/zip}. Checks the entries named by {@code WAV1_NAME} and
 * {@code WAV2_NAME} against {@code WAV1_MD5} / {@code WAV2_MD5}, and that the archive has no
 * entry under {@code UnpackerResource.TEXT_FILENAME}.
 */
@Test
public void testDocWAV() throws Exception {
    WebClient client = WebClient.create(endPoint + UNPACKER_PATH)
            .type(APPLICATION_MSWORD)
            .accept("application/zip");
    InputStream document = ClassLoader.getSystemResourceAsStream(TEST_DOC_WAV);
    Response response = client.put(document);

    InputStream zipBody = (InputStream) response.getEntity();
    Map<String, String> entries = readZipArchive(zipBody);

    assertEquals(WAV1_MD5, entries.get(WAV1_NAME));
    assertEquals(WAV2_MD5, entries.get(WAV2_NAME));
    assertFalse(entries.containsKey(UnpackerResource.TEXT_FILENAME));
}
/**
 * PUTs the {@code TEST_DOC} resource to the Tika endpoint as {@code application/msword},
 * asking for {@code text/xml}, and checks that the returned body contains {@code "test"}.
 */
@Test
public void testSimpleWordXML() throws Exception {
    WebClient client = WebClient.create(endPoint + TIKA_PATH)
            .type("application/msword")
            .accept("text/xml");
    InputStream document = ClassLoader.getSystemResourceAsStream(TEST_DOC);
    Response response = client.put(document);

    InputStream body = (InputStream) response.getEntity();
    String xml = getStringFromInputStream(body);

    assertTrue(xml.contains("test"));
}
/**
 * PUTs {@code password.xls} to the Tika endpoint as {@code application/vnd.ms-excel}, asking
 * for {@code text/xml}, and expects the response status to equal {@code UNPROCESSEABLE}.
 */
@Test
public void testPasswordXLSXML() throws Exception {
    WebClient client = WebClient.create(endPoint + TIKA_PATH)
            .type("application/vnd.ms-excel")
            .accept("text/xml");

    Response response = client.put(ClassLoader.getSystemResourceAsStream("password.xls"));

    assertEquals(UNPROCESSEABLE, response.getStatus());
}
/**
 * PUTs the {@code TEST_DOC_WAV} resource to the unpacker endpoint as {@code APPLICATION_MSWORD},
 * asking for {@code application/zip}, and checks the archive entry named by {@code JPG_NAME}
 * against {@code JPG_MD5}.
 */
@Test
public void testDocPicture() throws Exception {
    WebClient client = WebClient.create(endPoint + UNPACKER_PATH)
            .type(APPLICATION_MSWORD)
            .accept("application/zip");
    InputStream document = ClassLoader.getSystemResourceAsStream(TEST_DOC_WAV);
    Response response = client.put(document);

    InputStream zipBody = (InputStream) response.getEntity();
    Map<String, String> entries = readZipArchive(zipBody);

    assertEquals(JPG_MD5, entries.get(JPG_NAME));
}
/**
 * PUTs the {@code 2pic.doc} resource to the unpacker endpoint as {@code APPLICATION_MSWORD},
 * asking for {@code application/zip}, and checks the archive entry named by {@code JPG2_NAME}
 * against {@code JPG2_MD5}.
 */
@Test
public void testDocPictureNoOle() throws Exception {
    WebClient client = WebClient.create(endPoint + UNPACKER_PATH)
            .type(APPLICATION_MSWORD)
            .accept("application/zip");
    InputStream document = ClassLoader.getSystemResourceAsStream("2pic.doc");
    Response response = client.put(document);

    InputStream zipBody = (InputStream) response.getEntity();
    Map<String, String> entries = readZipArchive(zipBody);

    assertEquals(JPG2_MD5, entries.get(JPG2_NAME));
}
/**
 * Requests the {@code Author} metadata field as plain text for the test Word document, sending
 * the result of {@code copy(stream, 8000)} as the request body, and expects HTTP 400
 * (Bad Request) with the body {@code "Failed to get metadata field Author"}.
 *
 * <p>NOTE(review): {@code copy(stream, 8000)} presumably truncates the document to its first
 * 8000 bytes so the field cannot be extracted — confirm against the helper.
 *
 * <p>The final closing brace on this line ends the enclosing class, not this method.
 */
@Test public void testMeta() throws Exception { InputStream stream = ClassLoader.getSystemResourceAsStream(TikaResourceTest.TEST_DOC); Response response = WebClient.create(endPoint + "/meta" + "/Author").type("application/msword") .accept(MediaType.TEXT_PLAIN).put(copy(stream, 8000)); Assert.assertEquals(Response.Status.BAD_REQUEST.getStatusCode(), response.getStatus()); String msg = getStringFromInputStream((InputStream) response.getEntity()); assertEquals("Failed to get metadata field Author", msg); } }
/**
 * Requests the {@code dc:creator} metadata field as {@code application/rdf+xml}, sending the
 * result of {@code copy(stream, 12000)} for the test Word document as the request body.
 * Expects HTTP 200 and a body containing {@code <rdf:li>Maxim Valyanskiy</rdf:li>}.
 *
 * <p>NOTE(review): {@code copy(stream, 12000)} presumably sends only the first 12000 bytes of
 * the document — confirm against the helper.
 */
@Test
public void testGetField_Author_XMP_Partial_Found() throws Exception {
    InputStream document = ClassLoader.getSystemResourceAsStream(TikaResourceTest.TEST_DOC);

    String url = endPoint + META_PATH + "/dc:creator";
    WebClient client = WebClient.create(url)
            .type("application/msword")
            .accept("application/rdf+xml");
    Response response = client.put(copy(document, 12000));

    Assert.assertEquals(Response.Status.OK.getStatusCode(), response.getStatus());

    InputStream body = (InputStream) response.getEntity();
    String xmp = IOUtils.readStringFromStream(body);
    assertContains("<rdf:li>Maxim Valyanskiy</rdf:li>", xmp);
}
/**
 * PUTs the test Word document to the metadata endpoint as {@code application/msword}, asking
 * for {@code application/rdf+xml}, and checks that the returned body contains
 * {@code <rdf:li>Maxim Valyanskiy</rdf:li>}.
 */
@Test
public void testXMP() throws Exception {
    WebClient client = WebClient.create(endPoint + META_PATH)
            .type("application/msword")
            .accept("application/rdf+xml");
    InputStream document = ClassLoader.getSystemResourceAsStream(TikaResourceTest.TEST_DOC);
    Response response = client.put(document);

    InputStream body = (InputStream) response.getEntity();
    String xmp = IOUtils.readStringFromStream(body);

    assertContains("<rdf:li>Maxim Valyanskiy</rdf:li>", xmp);
}
/**
 * PUTs the {@code english.txt} classpath resource to the language-stream endpoint as
 * {@code text/plain} and expects a non-null response whose body is exactly {@code "en"}.
 */
@Test
public void testDetectEnglishFile() throws Exception {
    WebClient client = WebClient.create(endPoint + LANG_STREAM_PATH)
            .type("text/plain")
            .accept("text/plain");
    InputStream text = ClassLoader.getSystemResourceAsStream("english.txt");
    Response response = client.put(text);
    assertNotNull(response);

    InputStream body = (InputStream) response.getEntity();
    String detected = getStringFromInputStream(body);
    assertEquals("en", detected);
}
/**
 * Requests the field named by {@code TikaCoreProperties.CREATOR} as JSON, sending the result
 * of {@code copy(stream, 12000)} for the test Word document as the request body. Expects
 * HTTP 200 and a metadata object holding exactly one name, whose creator value is
 * {@code "Maxim Valyanskiy"}.
 *
 * <p>NOTE(review): {@code copy(stream, 12000)} presumably sends only the first 12000 bytes of
 * the document — confirm against the helper.
 */
@Test
public void testGetField_Author_JSON_Partial_Found() throws Exception {
    InputStream document = ClassLoader.getSystemResourceAsStream(TikaResourceTest.TEST_DOC);

    String url = endPoint + META_PATH + "/" + TikaCoreProperties.CREATOR.getName();
    WebClient client = WebClient.create(url)
            .type("application/msword")
            .accept(MediaType.APPLICATION_JSON);
    Response response = client.put(copy(document, 12000));

    Assert.assertEquals(Response.Status.OK.getStatusCode(), response.getStatus());

    InputStream body = (InputStream) response.getEntity();
    Metadata metadata = JsonMetadata.fromJson(new InputStreamReader(body, UTF_8));

    assertEquals("Maxim Valyanskiy", metadata.get(TikaCoreProperties.CREATOR));
    assertEquals(1, metadata.names().length);
}
/**
 * PUTs {@code FRENCH_STRING} to the language-string endpoint as {@code text/plain} and expects
 * a non-null response whose body is exactly {@code "fr"}.
 */
@Test
public void testDetectFrenchString() throws Exception {
    WebClient client = WebClient.create(endPoint + LANG_STRING_PATH)
            .type("text/plain")
            .accept("text/plain");
    Response response = client.put(FRENCH_STRING);
    assertNotNull(response);

    InputStream body = (InputStream) response.getEntity();
    String detected = getStringFromInputStream(body);
    assertEquals("fr", detected);
}
/**
 * PUTs {@code TRANSLATE_TXT} as {@code text/plain} to the URL built from
 * {@code TRANSLATE_ALL_PATH + LINGO_PATH + SRCDEST}, accepting any media type, and expects a
 * non-null response whose body equals the submitted text.
 */
@Test
public void testTranslateFull() throws Exception {
    WebClient client = WebClient.create(endPoint + TRANSLATE_ALL_PATH + LINGO_PATH + SRCDEST)
            .type("text/plain")
            .accept("*/*");
    Response response = client.put(TRANSLATE_TXT);
    assertNotNull(response);

    InputStream body = (InputStream) response.getEntity();
    String translated = getStringFromInputStream(body);
    assertEquals(TRANSLATE_TXT, translated);
}
/**
 * PUTs {@code ENGLISH_STRING} to the language-string endpoint as {@code text/plain} and
 * expects a non-null response whose body is exactly {@code "en"}.
 */
@Test
public void testDetectEnglishString() throws Exception {
    WebClient client = WebClient.create(endPoint + LANG_STRING_PATH)
            .type("text/plain")
            .accept("text/plain");
    Response response = client.put(ENGLISH_STRING);
    assertNotNull(response);

    InputStream body = (InputStream) response.getEntity();
    String detected = getStringFromInputStream(body);
    assertEquals("en", detected);
}