/**
 * Creates a corpus from a variable number of files and/or directories.
 *
 * <p>
 * Convenience overload that wraps the supplied array in a list view and delegates to
 * {@code create(boolean, Iterable)}.
 * </p>
 *
 * @param recursive
 *            flag forwarded unchanged to the {@code Iterable}-based overload
 * @param filesOrDirs
 *            the files and/or directories to build the corpus from
 * @return the corpus produced by the {@code Iterable}-based overload
 */
public static Corpus create(final boolean recursive, final Object... filesOrDirs) {
    final Iterable<Object> sources = Arrays.asList(filesOrDirs);
    return create(recursive, sources);
}
/**
 * Returns the document at the current cursor position and moves the cursor forward by one.
 *
 * <p>
 * The cursor is advanced before the document is fetched, so it stays advanced even if the
 * lookup fails.
 * </p>
 *
 * @return the document obtained via {@code get} for the position the cursor was at
 */
@Override
public KAFDocument next() {
    final int current = this.index;
    this.index = current + 1;
    return get(current);
}
while (true) { final int i = counter.getAndIncrement(); if (i >= Runner.this.corpus.size()) { break; final Path path = Runner.this.corpus.file(i); final Path base = Runner.this.corpus.path(); final Path relative = base.toAbsolutePath().relativize( path.toAbsolutePath()); final KAFDocument document = Runner.this.corpus.get(i); docName = document.getPublic().publicId; MDC.put("context", docName);
Boolean recursive = cmd.hasOption("recursive"); Corpus corpus = Corpus.create(recursive, inputFolder); corpus.parallelStream().forEach(document -> { if (document != null) { tokens.addAndGet(document.getTerms().size());
/**
 * Returns a short human-readable description of this corpus.
 *
 * @return {@code "Empty corpus"} when the corpus has no files, otherwise the number of
 *         documents followed by the corpus path
 */
@Override
public String toString() {
    final int count = this.files.length;
    if (count == 0) {
        return "Empty corpus";
    }
    return count + " document(s) corpus (path: " + path() + ")";
}
/**
 * Returns a new corpus over the same files with an additional transformer applied.
 *
 * <p>
 * If this corpus already has a transformer, the supplied one is chained after it via
 * {@code andThen}; otherwise the supplied transformer is used as is.
 * </p>
 *
 * @param transformer
 *            the transformer to add
 * @return a new corpus sharing this corpus' files and carrying the combined transformer
 */
public Corpus transform(final BiConsumer<Path, KAFDocument> transformer) {
    final BiConsumer<Path, KAFDocument> combined;
    if (this.transformer != null) {
        combined = this.transformer.andThen(transformer);
    } else {
        combined = transformer;
    }
    return new Corpus(this.files, combined);
}
/**
 * Returns a sequential stream over the documents of this corpus.
 *
 * @return a sequential stream backed by this corpus' spliterator
 */
public Stream<KAFDocument> stream() {
    final boolean parallel = false;
    return StreamSupport.stream(spliterator(), parallel);
}
final String relativePath = file.toString().substring(path().toString().length()); document.getPublic().publicId = relativePath; if ("http://www.example.com".equals(document.getPublic().uri)) {
public Corpus[] split(@Nullable final Long shuffleSeed, final float... percentages) { // Shuffle the files if necessary, using the supplied seed Path[] files = this.files; if (shuffleSeed != null) { final List<Path> list = Lists.newArrayList(files); final Random random = new Random(shuffleSeed); Collections.shuffle(list, random); files = list.toArray(new Path[list.size()]); } // Split the (shuffled) file array based on supplied percentages final Corpus[] corpora = new Corpus[percentages.length]; int index = 0; float cumulated = 0.0f; for (int i = 0; i < percentages.length; ++i) { cumulated += percentages[i]; if (cumulated > 1.0f) { throw new IllegalArgumentException("Invalid percentages (sum must be 1.0f): " + Arrays.toString(percentages)); } final int endIndex = (int) Math.ceil(files.length * cumulated); final Path[] partition = Arrays.copyOfRange(files, index, endIndex); if (shuffleSeed != null) { Arrays.sort(partition); } corpora[i] = new Corpus(partition, this.transformer); index = endIndex; } return corpora; }
/**
 * Returns a parallel stream over the documents of this corpus.
 *
 * @return a parallel stream backed by this corpus' spliterator
 */
public Stream<KAFDocument> parallelStream() {
    final boolean parallel = true;
    return StreamSupport.stream(spliterator(), parallel);
}
// Build the corpus from inputFolder; the first argument (false) is the 'recursive' flag
Iterable<KAFDocument> corpus = Corpus.create(false, inputFolder);
/**
 * Advances the underlying delegate by one element, if any, and passes the corresponding
 * document to the supplied action.
 *
 * @param action
 *            the consumer receiving the document obtained via {@code get} for the element
 *            produced by the delegate
 * @return the value returned by the delegate's {@code tryAdvance}
 */
@Override
public boolean tryAdvance(final Consumer<? super KAFDocument> action) {
    final boolean advanced = delegate.tryAdvance(file -> action.accept(get(file)));
    return advanced;
}
public static Corpus create(final boolean recursive, final Iterable<?> filesOrDirs) { final List<Path> paths = Lists.newArrayList(); for (final Object fileOrDir : filesOrDirs) { if (fileOrDir instanceof Path) { paths.add((Path) fileOrDir); } else if (fileOrDir instanceof File) { paths.add(((File) fileOrDir).toPath()); } else { paths.add(Paths.get(fileOrDir.toString())); } } // todo: this uses Util, a class included in utils-svm final List<Path> files = Util.fileMatch(paths, ImmutableList.of(".naf", ".naf.gz", ".naf.bz2", ".naf.xz", ".xml", ".xml.gz", ".xml.bz2", ".xml.xz"), recursive); for (int i = 0; i < files.size(); ++i) { files.set(i, files.get(i).toAbsolutePath().normalize()); } if (files.isEmpty()) { return EMPTY; } else { return new Corpus(files.toArray(new Path[files.size()]), null); } }
/**
 * Creates a runner configured from command line arguments.
 *
 * <p>
 * The arguments are parsed against the option specification below. The {@code o} option
 * supplies the output file, the positional arguments supply the input files of the
 * corpus, and the {@code r}, {@code m}, {@code n} and {@code i} flags control
 * respectively recursion in corpus creation, merging and normalization in the RDF
 * generator, and the intermediate setting of the runner.
 * </p>
 *
 * @param name
 *            not used by this method
 * @param args
 *            the command line arguments to parse
 * @return a runner over the created corpus and RDF generator
 */
static Runner create(final String name, final String... args) {

    // Parse the command line
    final Options parsed = Options.parse(
            "r,recursive|o,output!|m,merge|n,normalize|i,intermediate|+", args);

    // Read the output file and the flags
    final File output = parsed.getOptionArg("o", File.class);
    final boolean recurse = parsed.hasOption("r");
    final boolean withMerge = parsed.hasOption("m");
    final boolean withNormalize = parsed.hasOption("n");
    final boolean keepIntermediate = parsed.hasOption("i");

    // Build the corpus from the positional arguments
    final Corpus input = Corpus.create(recurse, parsed.getPositionalArgs(File.class));

    // Configure the RDF generator
    final RDFGenerator rdfGenerator = RDFGenerator.builder()
            .withProperties(Util.PROPERTIES, "eu.fbk.dkm.pikes.rdf.RDFGenerator")
            .withMerging(withMerge)
            .withNormalization(withNormalize)
            .build();

    return new Runner(input, rdfGenerator, output, keepIntermediate);
}