doi = dm.getBasicMetadata().getDoi().replaceAll("\\s++", " ").trim(); if(doi.length()==0){ throw new Exception("Lack of doi"); for (Author a : dm.getBasicMetadata().getAuthorList()) { try { String sname = a.getSurname();
for(TextWithLanguage twl : dm.getBasicMetadata().getTitleList()){ if(twl.getLanguage().toLowerCase().startsWith("en")){ title=twl.getText(); title = dm.getBasicMetadata().getTitle(0).getText(); doi = dm.getBasicMetadata().getDoi().replaceAll("\\s++", " ").trim(); }catch(Exception e){ }finally{ year = dm.getBasicMetadata().getYear().replaceAll("\\s++", " ").trim(); }catch(Exception e){ }finally{
/**
 * Extracts the document title written in the configured language ({@code language} field).
 *
 * @param dm document metadata holding zero or more titles, each tagged with a language
 * @return the matching title (multiple matches are concatenated with a single space),
 *         or {@code null} when no title in the requested language exists or the
 *         resulting title is blank
 */
private String extractLangTitle(DocumentMetadata dm) {
    List<String> titleList = new ArrayList<String>();
    for (TextWithLanguage title : dm.getBasicMetadata().getTitleList()) {
        if (language.equalsIgnoreCase(title.getLanguage())) {
            titleList.add(title.getText());
        }
    }
    String docTitle;
    switch (titleList.size()) {
        case 0:
            logger.info("No title IN GIVEN LANG (" + language + ") out of "
                    + dm.getBasicMetadata().getTitleCount() + " titles. Ignoring record!");
            return null;
        case 1:
            docTitle = titleList.get(0);
            break;
        default:
            // More than one title in the requested language: keep all of them.
            // (Fixed message typo: "more then one" -> "more than one".)
            logger.info("Number of titles IN GIVEN LANGUAGE (" + language + ") is more than one. "
                    + "Titles will be concatenated");
            docTitle = Joiner.on(" ").join(titleList);
            break;
    }
    // A whitespace-only title is as useless as a missing one.
    if (docTitle.trim().isEmpty()) {
        return null;
    }
    return docTitle;
}
for (TextWithLanguage title : dm.getBasicMetadata().getTitleList()) { titleList.add(title.getText()); for (TextWithLanguage documentAbstract : dm.getBasicMetadata().getTitleList()) { abstractsList.add(documentAbstract.getText());
for (ClassifCode code : dm.getBasicMetadata().getClassifCodeList()) { for (String co_str : code.getValueList()) { bagsize++; for (TextWithLanguage title : dm.getBasicMetadata().getTitleList()) { titleList.add(title.getText()); for (TextWithLanguage documentAbstract : dm.getBasicMetadata().getTitleList()) { abstractsList.add(documentAbstract.getText());
if (dm.getBasicMetadata().getClassifCodeCount() > 0) { hasCateg = true; for (TextWithLanguage title : dm.getBasicMetadata().getTitleList()) { titleList.add(title.getText()); for (TextWithLanguage documentAbstract : dm.getBasicMetadata().getTitleList()) { abstractsList.add(documentAbstract.getText());
private static Pair<String[], Boolean> extractSurnames(DocumentProtos.DocumentMetadata doc) { RegexpParser authorParser = new RegexpParser("authorParser.properties", "author"); List<DocumentProtos.Author> authorList = doc.getBasicMetadata().getAuthorList(); String[] resultByPositionNb = new String[authorList.size()]; String[] resultByOrder = new String[authorList.size()]; for (DocumentProtos.Author author : doc.getBasicMetadata().getAuthorList()) { String surname; if (author.hasSurname()) {
doi = dm.getBasicMetadata().getDoi().replaceAll("\\s++", " ").trim(); }catch(Exception e){ }finally{
for (TextWithLanguage title : metadata.getBasicMetadata().getTitleList()) { titleList.add(title.getText()); for (TextWithLanguage documentAbstract : metadata.getBasicMetadata().getTitleList()) { abstractsList.add(documentAbstract.getText()); DataBag db = getCategories(metadata.getBasicMetadata().getClassifCodeList()); if (db.size() > 0) { Map<String, Object> map = new HashMap<String, Object>();
if(commonDocumentMetadata == null){ commonDocumentMetadata = DocumentMetadata.newBuilder(documentMetadata); commonBasicMetadata = BasicMetadata.newBuilder(commonDocumentMetadata.getBasicMetadata()); continue; commonBasicMetadata.addAuthor(documentMetadata.getBasicMetadata().getAuthor(0));
for (TextWithLanguage title : dm.getBasicMetadata().getTitleList()) { titles.add(title.getText());
, Bytes.toBytes(HBaseConstant.FAMILY_METADATA_DOCUMENT_QUALIFIER_PROTO)) != null) { DocumentMetadata dm = DocumentMetadata.parseFrom(scannerResult.value()); for(Author a : dm.getBasicMetadata().getAuthorList()){ names.add(a.getForenames() + " " + a.getSurname());
for (TextWithLanguage title : metadata.getBasicMetadata().getTitleList()) { titleList.add(title.getText()); for (TextWithLanguage documentAbstract : metadata.getBasicMetadata().getTitleList()) { abstractsList.add(documentAbstract.getText()); map.put("keywords", getConcatenated(metadata.getKeywordsList())); map.put("abstract", abstracts); DataBag db = getCategories(metadata.getBasicMetadata().getClassifCodeList()); map.put("categories", db); long num = db.size();
/**
 * Pig UDF entry point: deserializes a protobuf-encoded {@code DocumentMetadata}
 * from the first tuple field and flattens it into a map with the keys
 * {@code key}, {@code title}, {@code keywords}, {@code abstract} and {@code categories}.
 *
 * @param input tuple whose field 0 is a {@link DataByteArray} with serialized metadata
 * @return map of extracted document fields
 * @throws IOException wrapping any parsing/processing failure (original stack trace attached)
 */
@Override
public Map exec(Tuple input) throws IOException {
    try {
        DataByteArray protoMetadata = (DataByteArray) input.get(0);
        DocumentMetadata metadata = DocumentMetadata.parseFrom(protoMetadata.get());

        List<String> titleList = new ArrayList<String>();
        for (TextWithLanguage title : metadata.getBasicMetadata().getTitleList()) {
            titleList.add(title.getText());
        }
        String titles = Joiner.on(" ").join(titleList);

        List<String> abstractsList = new ArrayList<String>();
        // BUG FIX: previously this loop iterated getTitleList() again, so the
        // "abstract" entry silently contained the titles. Abstracts live on
        // DocumentMetadata.getDocumentAbstractList() (cf. sibling UDFs in this codebase).
        for (TextWithLanguage documentAbstract : metadata.getDocumentAbstractList()) {
            abstractsList.add(documentAbstract.getText());
        }
        String abstracts = Joiner.on(" ").join(abstractsList);

        Map<String, Object> map = new HashMap<String, Object>();
        map.put("key", metadata.getKey());
        map.put("title", titles);
        map.put("keywords", getConcatenated(metadata.getKeywordsList()));
        map.put("abstract", abstracts);
        map.put("categories", getCategories(metadata.getBasicMetadata().getClassifCodeList()));
        return map;
    } catch (Exception e) {
        logger.error("Error in processing input row:", e);
        throw new IOException("Caught exception processing input row:\n"
                + StackTraceExtractor.getStackTrace(e));
    }
}
List<Author> aths = dw.getDocumentMetadata().getBasicMetadata().getAuthorList();
documentMetadata.addCollection("orcid"); BasicMetadata.Builder basicMetadata = BasicMetadata.newBuilder(); basicMetadata.setDoi(doi);
/**
 * Pulls a well-formed DOI out of the (possibly noisy) DOI field of the metadata.
 *
 * Handles two known data-corruption patterns before matching:
 * a pipe-separated duplicate ("doi|doi" or "doi|issn...") and a string that is
 * simply the same DOI written twice back-to-back.
 *
 * @param doc document metadata; its basic-metadata DOI field may be absent or dirty
 * @return the first substring matching the DOI pattern (10.NNNN/suffix), or
 *         {@code null} when the field is missing or no DOI can be recognized
 */
private static String extractDOI(DocumentProtos.DocumentMetadata doc) {
    DocumentProtos.BasicMetadata basic = doc.getBasicMetadata();
    if (!basic.hasDoi()) {
        return null;
    }

    String candidate = basic.getDoi().trim();

    // Case 1: "value|value" or "value|issn..." — keep the left part only.
    String[] parts = candidate.split("\\|");
    boolean pipeDuplicate = parts.length == 2
            && (parts[0].equals(parts[1]) || parts[1].startsWith("issn"));
    if (pipeDuplicate) {
        candidate = parts[0];
    } else if (candidate.length() % 2 == 0) {
        // Case 2: the DOI concatenated with itself — keep one half.
        int mid = candidate.length() / 2;
        String left = candidate.substring(0, mid);
        String right = candidate.substring(mid);
        if (left.equals(right)) {
            candidate = left;
        }
    }

    // Standard DOI shape: "10." + 4+ digits + "/" + suffix, trimmed of trailing junk.
    Pattern doiPattern =
            Pattern.compile(".*?(10[.][0-9]{4,}[^\\s\"/<>]*/[^\\s\"]+[^\\s\"\\]\\.;]).*");
    Matcher matcher = doiPattern.matcher(candidate);
    return matcher.matches() ? matcher.group(1) : null;
}
}
/**
 * Pig UDF: replaces the DOI inside a serialized {@code DocumentWrapper}.
 *
 * Expects a 3-field tuple: (row key, protobuf-serialized wrapper, corrected DOI).
 * Returns a 2-field tuple: (row key, re-serialized wrapper with the DOI swapped in),
 * or {@code null} when the input tuple is absent or malformed.
 *
 * @throws IOException wrapping any deserialization/processing failure
 */
public Tuple exec(Tuple input) throws IOException {
    if (input == null || input.size() != 3) {
        return null;
    }
    try {
        String key = (String) input.get(0);
        DocumentWrapper wrapper = DocumentWrapper.parseFrom(((DataByteArray) input.get(1)).get());
        String correctedDoi = (String) input.get(2);

        // Rebuild the nested message chain bottom-up with the corrected DOI.
        DocumentWrapper.Builder wrapperBuilder = DocumentWrapper.newBuilder(wrapper);
        DocumentMetadata.Builder metadataBuilder =
                DocumentMetadata.newBuilder(wrapper.getDocumentMetadata());
        BasicMetadata.Builder basicBuilder =
                BasicMetadata.newBuilder(metadataBuilder.getBasicMetadata());
        basicBuilder.setDoi(correctedDoi);
        metadataBuilder.setBasicMetadata(basicBuilder);
        wrapperBuilder.setDocumentMetadata(metadataBuilder);

        Tuple result = TupleFactory.getInstance().newTuple();
        result.append(key);
        result.append(new DataByteArray(wrapperBuilder.build().toByteArray()));
        return result;
    } catch (Exception e) {
        logger.error("Error in processing input row:" + StackTraceExtractor.getStackTrace(e), e);
        throw new IOException("Caught exception processing input row:\n"
                + StackTraceExtractor.getStackTrace(e));
    }
}
}
/**
 * Copies document-level metadata fields (key, titles, abstracts, keywords,
 * contributors) into the given output tuple at the positions dictated by
 * {@code fieldNumberMap}.
 *
 * @param metadata source document metadata
 * @param output   tuple being populated; mutated in place
 * @return the same {@code output} tuple, for call chaining
 * @throws ExecException when a tuple position cannot be set
 */
private Tuple addDocumentMetatdataFields(DocumentMetadata metadata, Tuple output)
        throws ExecException {
    output.set(fieldNumberMap.get(C.KEY), metadata.getKey());
    appendToOutput(output, C.TITLE, metadata.getBasicMetadata().getTitleList());
    appendToOutput(output, C.ABSTRACT_TEXT, metadata.getDocumentAbstractList());

    // Flatten all keyword lists into one bag, sanitizing each entry for Pig.
    List<String> keywords = new ArrayList<String>();
    for (KeywordsList keywordsList : metadata.getKeywordsList()) {
        for (String keyword : keywordsList.getKeywordsList()) {
            keywords.add(removeAllPigUnfriendlySigns(keyword));
        }
    }
    output.set(fieldNumberMap.get(C.KEYWORDS), listToDataBag(keywords));

    // Contributors are emitted as parallel (key, name) lists.
    List<String> authorKeys = new ArrayList<String>();
    List<String> authorNames = new ArrayList<String>();
    for (Author author : metadata.getBasicMetadata().getAuthorList()) {
        authorKeys.add(author.getKey());
        authorNames.add(author.getName());
    }
    output.set(fieldNumberMap.get(C.CONTRIBUTORS), listToDataBag(authorKeys, authorNames));

    return output;
}
/**
 * Converts a Google Scholar media record into BW2 document metadata.
 *
 * Parses the media payload as a {@code ScholarRecordP} and, on success, fills
 * the supplied {@code DocumentMetadata.Builder} (including a freshly built
 * {@code BasicMetadata}) via {@code translateGoogleScholarToDocumentMetadata}.
 *
 * @param media    record whose content is a serialized ScholarRecordP
 * @param docNewId identifier assigned to the resulting document
 * @param dmBuider builder receiving the translated metadata (mutated on success)
 * @param builder  wrapper builder (unused here; part of the interface contract)
 * @return {@code true} when parsing and translation succeed, {@code false} otherwise
 */
@Override
public boolean transform(Media media, String docNewId,
        DocumentMetadata.Builder dmBuider, DocumentProtos.DocumentWrapper.Builder builder) {
    // toByteArray() already returns a fresh array, so the previous extra
    // Arrays.copyOf() duplicated the payload for no benefit and was removed.
    byte[] content = media.getContent().toByteArray();

    ScholarRecordP record;
    try {
        record = ScholarRecordP.parseFrom(content);
    } catch (InvalidProtocolBufferException ex) {
        java.util.logging.Logger.getLogger(GsMediaToBw2Metadata.class.getName())
                .log(Level.SEVERE, null, ex);
        return false;
    }

    BasicMetadata.Builder bmBuilder = BasicMetadata.newBuilder();
    AuthorData ad = new AuthorData();
    ad.docId = docNewId;
    if (translateGoogleScholarToDocumentMetadata(record, dmBuider, bmBuilder, ad)) {
        dmBuider.setBasicMetadata(bmBuilder);
        return true;
    }
    return false;
}
}