/**
 * Convenience factory: builds an empty {@link Parse} that carries this
 * status object.
 *
 * @param conf configuration handed through to the empty parse implementation
 * @return a new {@code EmptyParseImpl} constructed from this status and {@code conf}
 */
public Parse getEmptyParse(Configuration conf) {
  Parse emptyParse = new EmptyParseImpl(this, conf);
  return emptyParse;
}
/**
 * Creates a fresh {@code ParseStatus} and populates it from the given input.
 *
 * @param in source to read the serialized fields from
 * @return the newly created, populated instance
 * @throws IOException if {@code readFields} fails while reading
 */
public static ParseStatus read(DataInput in) throws IOException {
  final ParseStatus parsed = new ParseStatus();
  parsed.readFields(in);
  return parsed;
}
/**
 * Creates a fresh {@code ParseText} and populates it from the given input.
 *
 * @param in source to read the serialized fields from
 * @return the newly created, populated instance
 * @throws IOException if {@code readFields} fails while reading
 */
public final static ParseText read(DataInput in) throws IOException {
  final ParseText result = new ParseText();
  result.readFields(in);
  return result;
}
@Override public void processData(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks) { // Get metadata Metadata metadata = parse.getData().getParseMeta(); try { // Initialize the writers // Only on the first execution initWriters(); for(Entry entry : entryList) { CSVPrint csvPrint = nameCsvPrintMap.get(entry.getParameterMap().get(NAME)); String[] fieldValues = new String[entry.getFieldList().size()]; List<Field> fieldList = entry.getFieldList(); for (int i = 0; i < fieldList.size(); i++) { fieldValues[i] = FilterUtils.getNullSafe(metadata.get(fieldList.get(i).getName()), ""); } // Write field values to CSV file csvPrint.println(fieldValues); } } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } }
@SuppressWarnings("rawtypes") @Override public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags metaTags, DocumentFragment doc) { Metadata metadata = parseResult.get(content.getUrl()).getData().getParseMeta(); byte[] rawContent = content.getContent(); System.err.println(e.getMessage()); log.error("Error parsing urlRegex: " + e.getMessage()); return new ParseStatus(ParseStatus.FAILED, "Error parsing urlRegex: " + e.getMessage()).getEmptyParseResult(content.getUrl(), configuration); } catch (ParserConfigurationException e) { System.err.println(e.getMessage()); log.error("HTML Cleaning error: " + e.getMessage()); return new ParseStatus(ParseStatus.FAILED, "HTML Cleaning error: " + e.getMessage()).getEmptyParseResult(content.getUrl(), configuration); } catch (SAXException e) { System.err.println(e.getMessage()); log.error("XML parsing error: " + e.getMessage()); return new ParseStatus(ParseStatus.FAILED, "XML parsing error: " + e.getMessage()).getEmptyParseResult(content.getUrl(), configuration); } catch (JaxenException e) { System.err.println(e.getMessage()); log.error("XPath error: " + e.getMessage()); return new ParseStatus(ParseStatus.FAILED, "XPath error: " + e.getMessage()).getEmptyParseResult(content.getUrl(), configuration);
ParseText text = new ParseText(txtContent[0]); outlinks[i] = new Outlink(indexedUrls.get(olinks.refs[i]).toString()); StringUtil.toHexString(MD5Hash.digest(txtContent[0].getBytes()).getDigest())); ParseData data = new ParseData(new ParseStatus(ParseStatus.SUCCESS), txtContent[1], outlinks, contentMeta, new Metadata());
/**
 * Creates a fresh {@code Outlink} and populates it from the given input.
 *
 * @param in source to read the serialized fields from
 * @return the newly created, populated instance
 * @throws IOException if {@code readFields} fails while reading
 */
public static Outlink read(DataInput in) throws IOException {
  final Outlink link = new Outlink();
  link.readFields(in);
  return link;
}
public final void write(DataOutput out) throws IOException { out.writeByte(VERSION); // write version status.write(out); // write status Text.writeString(out, title); // write title out.writeInt(outlinks.length); // write outlinks for (int i = 0; i < outlinks.length; i++) { outlinks[i].write(out); } contentMeta.write(out); // write content metadata parseMeta.write(out); }
public final void readFields(DataInput in) throws IOException { version = in.readByte(); // incompatible change from UTF8 (version < 5) to Text if (version != VERSION) throw new VersionMismatchException(VERSION, version); status = ParseStatus.read(in); title = Text.readString(in); // read title int numOutlinks = in.readInt(); outlinks = new Outlink[numOutlinks]; for (int i = 0; i < numOutlinks; i++) { outlinks[i] = Outlink.read(in); } if (version < 3) { int propertyCount = in.readInt(); // read metadata contentMeta.clear(); for (int i = 0; i < propertyCount; i++) { contentMeta.add(Text.readString(in), Text.readString(in)); } } else { contentMeta.clear(); contentMeta.readFields(in); } if (version > 3) { parseMeta.clear(); parseMeta.readFields(in); } }
public EmptyParseImpl(ParseStatus status, Configuration conf) { data = new ParseData(status, "", new Outlink[0], new Metadata(), new Metadata()); }
// Fetch the parse-level metadata from the parse's data object.
Metadata metadata = parse.getData().getParseMeta();
@Override public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks) throws IndexingException { Metadata metadata = parse.getData().getParseMeta();