/**
 * Loads the lines of the given file and converts them to a multimap by delegating to the
 * line-based {@code loadMultiMap} overload.
 *
 * @param path file whose lines are loaded with {@code TextIO.loadLines}.
 * @return the multimap built by the line-based overload.
 * @throws IOException if loading the lines fails.
 */
private ArrayListMultimap<String, String> loadMultiMap(Path path) throws IOException {
  return loadMultiMap(TextIO.loadLines(path));
}
/**
 * Loads pair rules from a text file, attempting to parse one rule per loaded line.
 *
 * <p>Lines are obtained with {@code TextIO.loadLines(path, "#")} (the {@code "#"} argument is
 * presumably a comment marker; confirm against TextIO). Any line for which
 * {@code PairRule.fromLine} returns {@code null} is left out of the result.
 *
 * @param path file containing the rule definitions.
 * @return the successfully parsed rules, in file order.
 * @throws IOException if loading the lines fails.
 */
List<PairRule> loadPairRule(Path path) throws IOException {
  List<PairRule> parsed = new ArrayList<>();
  for (String text : TextIO.loadLines(path, "#")) {
    PairRule candidate = PairRule.fromLine(text);
    if (candidate != null) {
      parsed.add(candidate);
    }
  }
  return parsed;
}
/**
 * Builds a {@code Weights} instance from a text file by loading its lines and handing them to
 * {@code loadFromLines}.
 *
 * @param file file whose lines are loaded with {@code TextIO.loadLines}.
 * @return the result of {@code loadFromLines} for the loaded lines.
 * @throws IOException if loading the lines fails.
 */
public static Weights loadFromFile(Path file) throws IOException {
  return loadFromLines(TextIO.loadLines(file));
}
/**
 * Writes a labeled test set to {@code out}.
 *
 * <p>Every line of {@code p} is cleaned (parenthesized spans replaced by a space, whitespace runs
 * collapsed, ends trimmed) and emitted once with the {@code __label__none} label. Each cleaned
 * line is then emitted a second time with the {@code __label__rec_notice} label, after a randomly
 * chosen line from {@code labeled} has been inserted at a random token position. The random
 * generator uses a fixed seed, so the output is reproducible for the same inputs.
 *
 * @param p file with the unlabeled lines.
 * @param labeled file providing the lines that are injected into the positive examples.
 * @param out destination file for the generated test set.
 * @throws IOException if reading an input or writing the output fails.
 */
static void createTestSet(Path p, Path labeled, Path out) throws IOException {
  // Clean the raw lines first; both label groups are built from the cleaned text.
  List<String> cleaned = new ArrayList<>();
  for (String rawLine : TextIO.loadLines(p)) {
    String withoutParens = rawLine.replaceAll("[(].+?[)]", " ");
    cleaned.add(withoutParens.replaceAll("\\s+", " ").trim());
  }

  List<String> output = new ArrayList<>();
  for (String sentence : cleaned) {
    output.add("__label__none " + sentence);
  }

  Random random = new Random(2345);
  List<String> notices = TextIO.loadLines(labeled);
  for (String sentence : cleaned) {
    List<String> words = new ArrayList<>(Splitter.on(" ").splitToList(sentence));
    // Draw order matters for reproducibility: the notice is picked first, then its position.
    String notice = notices.get(random.nextInt(notices.size()));
    int position = random.nextInt(words.size());
    words.add(position, notice);
    output.add("__label__rec_notice " + String.join(" ", words));
  }

  Files.write(out, output);
}
/**
 * Generates a shuffled training file from labeled and junk lines.
 *
 * <p>The junk lines are shuffled and at most {@code junkCount} of them are written with the
 * {@code __label__none} label. Every line of {@code labeled} is written five times with the
 * {@code __label__rec_notice} label. The combined set is shuffled again before it is written.
 * A fixed seed is used for both shuffles.
 *
 * @param labeled file with the positive example lines.
 * @param junk file with the negative example lines.
 * @param junkCount maximum number of junk lines to use; capped at the number available.
 * @param out destination file for the training set.
 * @throws IOException if reading an input or writing the output fails.
 */
static void generateTraining(Path labeled, Path junk, int junkCount, Path out)
    throws IOException {
  Random random = new Random(1234);
  List<String> positives = TextIO.loadLines(labeled);
  List<String> negatives = TextIO.loadLines(junk);

  int limit = Math.min(junkCount, negatives.size());
  Collections.shuffle(negatives, random);

  List<String> examples = new ArrayList<>();
  for (String line : negatives.subList(0, limit)) {
    examples.add("__label__none " + line);
  }
  // Positive examples are repeated five times.
  for (int pass = 0; pass < 5; pass++) {
    for (String line : positives) {
      examples.add("__label__rec_notice " + line);
    }
  }

  Collections.shuffle(examples, random);
  Files.write(out, examples);
}
/**
 * Creates a {@code BlockTextLoader} over all regular files found directly inside a set of
 * corpus directories.
 *
 * <p>The directory names are loaded from {@code folderListFile} with
 * {@code TextIO.loadLines(folderListFile, "#")} (the {@code "#"} argument is presumably a
 * comment marker; confirm against TextIO) and resolved against {@code corporaRoot}. Each
 * directory is walked with a maximum depth of 1, so nested sub-directories are not descended
 * into. The collected files are sorted by absolute path before the loader is created.
 *
 * @param corporaRoot base directory that the listed folder names are resolved against.
 * @param folderListFile text file listing one corpus folder name per line.
 * @param blockSize block size passed on to the {@code BlockTextLoader} constructor.
 * @return a loader over the discovered corpus files.
 * @throws IOException if the folder list cannot be loaded or a directory cannot be walked.
 */
public static BlockTextLoader fromDirectoryRoot(
    Path corporaRoot, Path folderListFile, int blockSize) throws IOException {
  List<String> rootNames = TextIO.loadLines(folderListFile, "#");
  List<Path> roots = new ArrayList<>();
  for (String rootName : rootNames) {
    roots.add(corporaRoot.resolve(rootName));
  }

  List<Path> corpora = new ArrayList<>();
  for (Path corpusRoot : roots) {
    // Files.walk holds an open directory handle until the stream is closed, so it has to be
    // used inside try-with-resources; otherwise one handle leaks per corpus root.
    try (java.util.stream.Stream<Path> entries = Files.walk(corpusRoot, 1)) {
      corpora.addAll(entries
          .filter(s -> s.toFile().isFile())
          .collect(Collectors.toList()));
    }
  }

  corpora.sort(Comparator.comparing(a -> a.toFile().getAbsolutePath()));
  Log.info("There are %d corpus files.", corpora.size());
  return new BlockTextLoader(corpora, blockSize);
}
/**
 * Converts tab-separated {@code content<TAB>label} lines into {@code __label__<label> <content>}
 * lines.
 *
 * <p>Each line is split at its first tab character. The part before the tab is trimmed and
 * passed through {@code normalizer.normalize}; the part after it is trimmed and used as the
 * label. Lines without a tab are skipped.
 *
 * @param input file with the tab-separated lines.
 * @return the relabeled lines, in input order.
 * @throws IOException if loading the lines fails.
 */
static List<String> addLabels(Path input) throws IOException {
  List<String> labeled = new ArrayList<>();
  for (String row : TextIO.loadLines(input)) {
    int tab = row.indexOf('\t');
    if (tab < 0) {
      // No separator: nothing to label.
      continue;
    }
    // The flag is (re)applied for every processed line, right before normalization.
    normalizer.setAlwaysApplyDeasciifier(true);
    String text = normalizer.normalize(row.substring(0, tab).trim());
    String tag = row.substring(tab).trim();
    labeled.add("__label__" + tag + " " + text);
  }
  return labeled;
}
/**
 * Splits the raw data file into a test set and a training set and writes both to disk.
 *
 * <p>The lines of {@code root/raw3/all} are shuffled with a fixed seed. The first
 * {@code testSize} lines become the test set and all remaining lines become the training set.
 * Both sets are then reduced to the lines containing {@code __label__}, stripped of a leading
 * double quote and normalized. The results are written to the {@code trainRaw} and
 * {@code testRaw} paths after the {@code t1out} directory has been created.
 *
 * @param testSize number of shuffled lines to reserve for the test set.
 * @throws IOException if the raw file cannot be loaded or an output cannot be written.
 * @throws IndexOutOfBoundsException if {@code testSize} is negative or larger than the number of
 *     loaded lines.
 */
void generateData(int testSize) throws IOException {
  Path raw = root.resolve("raw3/all");
  Random r = new Random(1);
  List<String> lines = TextIO.loadLines(raw);
  Collections.shuffle(lines, r);

  List<String> test = lines.subList(0, testSize);
  // subList's end index is exclusive: lines.size() is required so that the last line is kept.
  List<String> train = lines.subList(testSize, lines.size());
  Log.info("Train = %d, Test = %d lines.", train.size(), test.size());

  train = keepLabeledAndNormalize(train);
  test = keepLabeledAndNormalize(test);
  Log.info("After pre-process, Train = %d, Test = %d lines.", train.size(), test.size());

  Files.createDirectories(t1out);
  Files.write(trainRaw, train);
  Files.write(testRaw, test);
}

/**
 * Keeps only the lines containing {@code __label__}, removes a leading double quote from each
 * and normalizes the result.
 */
private List<String> keepLabeledAndNormalize(List<String> lines) {
  return lines.stream()
      .filter(s -> s.contains("__label__"))
      .map(s -> s.replaceAll("^\"", ""))
      .map(s -> normalizer.normalize(s))
      .collect(Collectors.toList());
}
/**
 * Builds a bidirectional word/index lookup from a text file in which every line holds a word
 * and its integer index, separated by {@code delimiter}.
 *
 * @param path file to read; must exist.
 * @param delimiter character separating the word from the index on each line.
 * @return a lookup containing every word/index pair of the file.
 * @throws IOException if loading the lines fails.
 * @throws IllegalArgumentException if the file does not exist, or a line carries a negative
 *     index, a word that was already seen, or an index that was already seen. A non-numeric
 *     index surfaces as the {@code NumberFormatException} thrown by {@code Integer.parseInt}.
 */
public static BidirectionalIndexLookup<String> fromTextFileWithIndex(Path path, char delimiter)
    throws IOException {
  if (!path.toFile().exists()) {
    throw new IllegalArgumentException("File " + path + " does not exist.");
  }
  List<String> lines = TextIO.loadLines(path);
  UIntValueMap<String> indexLookup = new UIntValueMap<>(lines.size());
  UIntMap<String> wordLookup = new UIntMap<>(lines.size());
  for (String line : lines) {
    StringPair pair = StringPair.fromString(line, delimiter);
    String word = pair.first;
    int index = Integer.parseInt(pair.second);
    // Reject negative values before the index is handed to either map.
    // NOTE(review): the maps look like unsigned-int containers, so a negative key may fail
    // there with a less specific error - confirm against UIntMap.
    if (index < 0) {
      throw new IllegalArgumentException("Index Value cannot be negative : [" + line + "]");
    }
    if (indexLookup.contains(word)) {
      throw new IllegalArgumentException("Duplicated word in line : [" + line + "]");
    }
    if (wordLookup.containsKey(index)) {
      throw new IllegalArgumentException("Duplicated index in line : [" + line + "]");
    }
    indexLookup.put(word, index);
    wordLookup.put(index, word);
  }
  return new BidirectionalIndexLookup<>(indexLookup, wordLookup);
}
List<String> lines = TextIO.loadLines(path);
static List<SentenceDataStr> loadTrainingDataText(Path input) throws IOException { List<String> allLines = TextIO.loadLines(input);
/**
 * Loads a model from a text file in which every line has the form {@code <weight> <key>}.
 *
 * <p>The text before the first space is parsed as a float weight and the text after the first
 * space is used as the key.
 *
 * @param file model file to read.
 * @return a {@code Model} wrapping the loaded key/weight map.
 * @throws IOException if loading the lines fails.
 */
static Model loadFromTextFile(Path file) throws IOException {
  FloatValueMap<String> weightsByKey = new FloatValueMap<>(10000);
  for (String entry : TextIO.loadLines(file)) {
    float value = Float.parseFloat(Strings.subStringUntilFirst(entry, " "));
    String feature = Strings.subStringAfterFirst(entry, " ");
    weightsByKey.set(feature, value);
  }
  Log.info("Model Loaded.");
  return new Model(weightsByKey);
}
/**
 * Logs line, word and label statistics for each of the given files.
 *
 * <p>Every line is split on whitespace. Tokens containing {@code __label__} are counted as
 * labels and all other tokens as words. A line is additionally logged as a warning whenever
 * one of its label tokens contains a {@code -}.
 *
 * @param paths files to analyze.
 * @throws IOException if loading the lines of a file fails.
 */
static void countTokens(Path... paths) throws IOException {
  for (Path file : paths) {
    List<String> content = TextIO.loadLines(file);
    Histogram<String> words = new Histogram<>();
    Histogram<String> labels = new Histogram<>();
    for (String row : content) {
      for (String token : row.split("[\\s]+")) {
        if (!token.contains("__label__")) {
          words.add(token);
          continue;
        }
        if (token.contains("-")) {
          Log.warn(row);
        }
        labels.add(token);
      }
    }
    Log.info("There are %d lines, %d words, %d labels in %s",
        content.size(), words.size(), labels.size(), file);
  }
}
public TurkishSentenceExtractor train() throws IOException { FloatValueMap<String> weights = new FloatValueMap<>(); List<String> sentences = TextIO.loadLines(builder.trainFile); FloatValueMap<String> averages = new FloatValueMap<>();
public TrainableTokenizer train() throws IOException { FloatValueMap<String> weights = new FloatValueMap<>(); List<String> sentences = TextIO.loadLines(builder.trainFile); FloatValueMap<String> averages = new FloatValueMap<>();