/**
 * Creates the iterator from the values collected by this builder.
 *
 * @return a new {@code MboxIterator} constructed from the builder's file, charset,
 *         regular expression pattern, regular expression flags and maximum message size
 * @throws FileNotFoundException declared by the {@code MboxIterator} constructor
 * @throws IOException declared by the {@code MboxIterator} constructor
 */
public MboxIterator build() throws FileNotFoundException, IOException {
    final MboxIterator built = new MboxIterator(file, charset, regexpPattern, flags, maxMessageSize);
    return built;
}
}
/**
 * Starts building an iterator for the given file.
 *
 * @param filePath the file handed to the new {@code Builder}
 * @return a new {@code Builder} created from {@code filePath}
 */
public static Builder fromFile(File filePath) {
    final Builder builder = new Builder(filePath);
    return builder;
}
/**
 * Returns an iterator over the messages, each wrapped in a {@code CharBufferWrapper}.
 *
 * @return a newly created {@code MessageIterator}
 */
public Iterator<CharBufferWrapper> iterator() {
    final Iterator<CharBufferWrapper> messages = new MessageIterator();
    return messages;
}
@Override protected void doInitialize(UimaContext context) throws ResourceInitializationException { // Initialise charset for MBOX processing try { charset = Charset.forName(charsetName); } catch (UnsupportedCharsetException | IllegalCharsetNameException ce) { getMonitor().warn("Unsupported charset, {}. UTF-8 will be used.", charsetName, ce); charset = StandardCharsets.UTF_8; } // Initialise MBOX iterator try { mboxIterator = MboxIterator.fromFile(mbox) .charset(charset) .maxMessageSize(messageSize) .build() .iterator(); } catch (IOException ioe) { throw new ResourceInitializationException(ioe); } // Initialise message parser messageBuilder = new DefaultMessageBuilder(); messageBuilder.setContentDecoding(true); MimeConfig config = new MimeConfig.Builder().setMaxLineLen(10000).build(); messageBuilder.setMimeEntityConfig(config); // Build list of extensions to ignore for (String s : ignoreExtensions) { ignoreExtensionsList.add(s.trim().toLowerCase()); } }
public static void main(String[] args) throws Exception { if (args.length != 1) { System.out.println("Please supply a path to an mbox file to parse"); } final File mbox = new File(args[0]); long start = System.currentTimeMillis(); int count = 0; for (CharBufferWrapper message : MboxIterator.fromFile(mbox).charset(ENCODER.charset()).build()) { // saveMessageToFile(count, buf); System.out.println(messageSummary(message.asInputStream(ENCODER.charset()))); count++; } System.out.println("Found " + count + " messages"); long end = System.currentTimeMillis(); System.out.println("Done in: " + (end - start) + " milis"); }
/**
 * Initializes the mailbox iterator: decodes the first chunk of the file into the character
 * buffer and searches it for the first message start line.
 *
 * <p>If a match is found its positions are saved. If no match is found and the matcher hit
 * the end of the input, the file is rejected.
 *
 * @throws IOException declared by this method; {@code decodeNextCharBuffer()} may throw a
 *         {@code CharConversionException}
 * @throws IllegalArgumentException if no line matching the message start pattern was found
 *         and the matcher reached the end of the decoded input
 */
protected void initMboxIterator() throws IOException {
    decodeNextCharBuffer();
    fromLineMatcher = MESSAGE_START.matcher(mboxCharBuffer);
    fromLineFound = fromLineMatcher.find();

    if (fromLineFound) {
        saveFindPositions(fromLineMatcher);
        return;
    }
    if (!fromLineMatcher.hitEnd()) {
        return;
    }

    // No match and the whole decoded input was scanned: report the file as unusable
    final String path = (mbox == null) ? "" : mbox.getPath();
    throw new IllegalArgumentException("File " + path
            + " does not contain From_ lines that match the pattern '"
            + MESSAGE_START.pattern()
            + "'! Maybe not be a valid Mbox or wrong matcher.");
}
} else { String raw = mboxIterator.next().toString(); count++;
/**
 * Reports whether another message start line has been found.
 *
 * <p>When no further message is available, {@code close()} is called before returning.
 *
 * @return {@code true} if a message start line was found, {@code false} otherwise
 * @throws RuntimeException if {@code close()} fails; the original {@link IOException} is
 *         attached as the cause
 */
public boolean hasNext() {
    if (!fromLineFound) {
        try {
            close();
        } catch (IOException e) {
            // Keep the underlying failure as the cause so it is not lost to the caller
            throw new RuntimeException("Exception closing file!", e);
        }
    }
    return fromLineFound;
}
/**
 * Exposes the wrapped message as a stream of bytes.
 *
 * @param encoding the charset used to encode the message buffer into bytes
 * @return a {@code ByteBufferInputStream} over the encoded message buffer
 */
public InputStream asInputStream(Charset encoding) {
    final java.nio.ByteBuffer encoded = encoding.encode(messageBuffer);
    return new ByteBufferInputStream(encoded);
}
private MboxIterator(final File mbox, final Charset charset, final String regexpPattern, final int regexpFlags, final int MAX_MESSAGE_SIZE) throws FileNotFoundException, IOException, CharConversionException { //TODO: do better exception handling - try to process some of them maybe? this.maxMessageSize = MAX_MESSAGE_SIZE; this.MESSAGE_START = Pattern.compile(regexpPattern, regexpFlags); this.DECODER = charset.newDecoder(); this.mboxCharBuffer = CharBuffer.allocate(MAX_MESSAGE_SIZE); this.mbox = mbox; this.theFile = new FileInputStream(mbox); this.byteBuffer = theFile.getChannel().map(FileChannel.MapMode.READ_ONLY, 0, theFile.getChannel().size()); initMboxIterator(); }
/**
 * Decodes the next portion of the byte buffer into the character buffer, updates the
 * end-of-input flag and flips the character buffer.
 *
 * @throws CharConversionException if the decoder reports malformed input or an unmappable
 *         character
 */
private void decodeNextCharBuffer() throws CharConversionException {
    final CoderResult outcome = DECODER.decode(byteBuffer, mboxCharBuffer, endOfInputFlag);
    updateEndOfInputFlag();
    mboxCharBuffer.flip();

    if (!outcome.isError()) {
        return;
    }
    if (outcome.isMalformed()) {
        throw new CharConversionException("Malformed input!");
    }
    if (outcome.isUnmappable()) {
        throw new CharConversionException("Unmappable character!");
    }
}
return new CharBufferWrapper(message);
MboxIterator.fromFile(mbox) .charset(charset) .maxMessageSize(messageSize) .build() .iterator(); } catch (IOException ioe) { throw new ResourceInitializationException(ioe);
/**
 * Starts building an iterator for the file identified by the given string.
 *
 * @param file the string handed to the new {@code Builder}
 * @return a new {@code Builder} created from {@code file}
 */
public static Builder fromFile(String file) {
    final Builder builder = new Builder(file);
    return builder;
}
} else { String raw = mboxIterator.next().toString(); count++;