/**
 * Collects every row whose cell at {@code column} equals {@code value}.
 *
 * @param column index of the cell inspected in each row
 * @param value  value the inspected cell must equal for the row to be kept
 * @return a new grid, constructed from this grid's {@code sep} and
 *         {@code numColumns}, to which the matching rows were added
 */
public StringGrid select(int column, String value) {
    StringGrid matches = new StringGrid(sep, numColumns);
    for (List<String> candidate : this) {
        String cell = candidate.get(column);
        if (cell.equals(value)) {
            matches.addRow(candidate);
        }
    }
    return matches;
}
/**
 * Produces a grid built from this one with duplicate rows stripped.
 *
 * @return a new grid constructed from this grid, after
 *         {@code stripDuplicateRows()} has been invoked on it
 */
public StringGrid getUniqueRows() {
    StringGrid deduplicated = new StringGrid(this);
    deduplicated.stripDuplicateRows();
    return deduplicated;
}
/**
 * Runs {@code dedupeByCluster(int)} once for every column of the grid.
 */
public void dedupeByClusterAll() {
    // The argument of dedupeByCluster is a column index, so the loop is bounded by the
    // column count (numColumns) rather than by the row count (size()).
    for (int column = 0; column < numColumns; column++) {
        dedupeByCluster(column);
    }
}
/** * Deduplicate based on the column clustering signature * @param column */ public void dedupeByCluster(int column) { StringCluster cluster = clusterColumn(column); System.out.println(cluster.get("family mcdonalds restaurant")); System.out.println(cluster.get("family mcdonalds restaurants")); List<Map<String, Integer>> list2 = cluster.getClusters(); for (int i = 0; i < list2.size(); i++) { if (list2.get(i).size() > 1) { System.out.println(list2.get(i)); } } FingerPrintKeyer keyer = new FingerPrintKeyer(); Set<Integer> alreadyDeDupped = new HashSet<>(); for (int i = 0; i < size(); i++) { String key = keyer.key(get(i).get(column)); Map<String, Integer> map = cluster.get(key); if (map != null && map.size() > 1) { List<Integer> list = filterRowsByColumn(column, map.keySet()); //deduplication to do if (list.size() > 1) modifyRows(alreadyDeDupped, column, list, map); } } }
/**
 * Collects the rows whose value in {@code column} is still present in the occurrence
 * counter after {@code dropElementsBelowThreshold(2.0f)} (presumably the values that
 * occur at least twice in that column - confirm against {@code Counter}).
 *
 * @param column index of the column to inspect; validated with {@code checkInvalidColumn}
 * @return a new grid, constructed from this grid's {@code sep} and
 *         {@code numColumns}, to which the matching rows were added in iteration order
 */
public StringGrid getRowsWithDuplicateValuesInColumn(int column) {
    checkInvalidColumn(column);
    StringGrid grid = new StringGrid(sep, numColumns);
    Counter<String> counter = new Counter<>();
    for (String val : getColumn(column)) {
        counter.incrementCount(val, 1.0f);
    }
    counter.dropElementsBelowThreshold(2.0f);
    Set<String> repeated = counter.keySet();
    for (List<String> row : this) {
        // A set lookup replaces the former scan over every key for every row; a row can
        // match at most one key of a set, so the selected rows are the same.
        if (repeated.contains(row.get(column))) {
            grid.addRow(row);
        }
    }
    return grid;
}
/**
 * Collects the rows in which the two given cells are at least {@code threshold}
 * similar according to {@code MathUtils.stringSimilarity}.
 *
 * @param threshold    minimum similarity score (inclusive) for a row to be kept
 * @param firstColumn  index of the first cell to compare; validated with {@code checkInvalidColumn}
 * @param secondColumn index of the second cell to compare; validated with {@code checkInvalidColumn}
 * @return a new grid, constructed from this grid's {@code sep} and
 *         {@code numColumns}, to which the qualifying rows were added
 */
public StringGrid getAllWithSimilarity(double threshold, int firstColumn, int secondColumn) {
    checkInvalidColumn(firstColumn);
    checkInvalidColumn(secondColumn);
    StringGrid similar = new StringGrid(sep, numColumns);
    for (List<String> row : this) {
        String left = row.get(firstColumn);
        String right = row.get(secondColumn);
        double score = MathUtils.stringSimilarity(left, right);
        if (score >= threshold) {
            similar.addRow(row);
        }
    }
    return similar;
}
/**
 * Returns the cells of one row as a fresh list.
 *
 * @param row index of the row; validated with {@code checkInvalidRow}
 * @return a new {@code ArrayList} populated from the row at that index
 */
public List<String> getRow(int row) {
    checkInvalidRow(row);
    List<String> cells = get(row);
    return new ArrayList<>(cells);
}
/**
 * Removes from this grid the rows in which the two given cells score below
 * {@code threshold} according to {@code MathUtils.stringSimilarity}.
 *
 * @param threshold    similarity score below which a row is removed
 * @param firstColumn  index of the first cell to compare; validated with {@code checkInvalidColumn}
 * @param secondColumn index of the second cell to compare; validated with {@code checkInvalidColumn}
 */
public void filterBySimilarity(double threshold, int firstColumn, int secondColumn) {
    checkInvalidColumn(firstColumn);
    checkInvalidColumn(secondColumn);
    // Rejected rows are gathered first and removed afterwards, so the grid is not
    // modified while it is being iterated.
    List<List<String>> rejected = new ArrayList<>();
    for (List<String> row : this) {
        String left = row.get(firstColumn);
        String right = row.get(secondColumn);
        double score = MathUtils.stringSimilarity(left, right);
        if (score < threshold) {
            rejected.add(row);
        }
    }
    removeAll(rejected);
}
// NOTE(review): these statements sit outside any method body in this view; presumably they
// are a fragment of a method whose header and remaining body are not visible here - confirm.
String max2 = maximalValue(cluster); StringTokenizer val = new StringTokenizer(max2); List<String> list = new ArrayList<>(); disambiguateRow(i2, column, chosenKey);
/**
 * Overwrites the cell at {@code column} with {@code value} in every row of the grid.
 *
 * @param value  the value written into each row
 * @param column index of the cell to overwrite; validated with {@code checkInvalidColumn}
 */
public void fillDown(String value, int column) {
    checkInvalidColumn(column);
    for (List<String> row : this) {
        row.set(column, value);
    }
}
/**
 * Builds a grid from the lines read from an input stream.
 *
 * <p>This method does not itself close {@code from}.
 *
 * @param from stream whose lines are read with {@code IOUtils.readLines}
 * @param sep  separator passed on to the {@code StringGrid} constructor
 * @return a grid constructed from {@code sep} and the lines that were read
 * @throws IOException           if reading the stream fails
 * @throws IllegalStateException if no lines were read
 */
public static StringGrid fromInput(InputStream from, String sep) throws IOException {
    List<String> lines = IOUtils.readLines(from);
    if (!lines.isEmpty()) {
        return new StringGrid(sep, lines);
    }
    throw new IllegalStateException("Nothing to read; file is empty");
}
/**
 * Collects the rows whose value in {@code column} occurs exactly once in that column.
 *
 * @param column index of the column to inspect; validated with {@code checkInvalidColumn}
 * @return a new grid, constructed from this grid's {@code sep} and
 *         {@code numColumns}, to which the rows with a non-repeated value were added
 */
public StringGrid getRowWithOnlyOneOccurrence(int column) {
    checkInvalidColumn(column);
    StringGrid grid = new StringGrid(sep, numColumns);
    Counter<String> counter = new Counter<>();
    for (String val : getColumn(column)) {
        counter.incrementCount(val, 1.0f);
    }
    // Keep only the values counted once. Previously the repeated values were removed
    // from the counter, but rows were then matched against an earlier snapshot of ALL
    // keys, so every row ended up in the result.
    Set<String> singles = new HashSet<>();
    for (String key : counter.keySet()) {
        if (counter.getCount(key) <= 1) {
            singles.add(key);
        }
    }
    for (List<String> row : this) {
        if (singles.contains(row.get(column))) {
            grid.addRow(row);
        }
    }
    return grid;
}
/**
 * Extracts one column of the grid as a list, one entry per row in iteration order.
 *
 * @param column index of the cell to take from each row; validated with {@code checkInvalidColumn}
 * @return a new list holding the cell at {@code column} of every row
 */
public List<String> getColumn(int column) {
    checkInvalidColumn(column);
    List<String> values = new ArrayList<>(size());
    for (List<String> row : this) {
        String cell = row.get(column);
        values.add(cell);
    }
    return values;
}
/**
 * Builds a grid from the lines of a file.
 *
 * @param file path of the file, read with {@code FileUtils.readLines}
 * @param sep  separator passed on to the {@code StringGrid} constructor
 * @return a grid constructed from {@code sep} and the lines that were read
 * @throws IOException           if reading the file fails
 * @throws IllegalStateException if no lines were read
 */
public static StringGrid fromFile(String file, String sep) throws IOException {
    File source = new File(file);
    List<String> lines = FileUtils.readLines(source);
    if (!lines.isEmpty()) {
        return new StringGrid(sep, lines);
    }
    throw new IllegalStateException("Nothing to read; file is empty");
}
/**
 * Merges two columns into one in every row.
 *
 * <p>For each row the cell at {@code column1} is concatenated with the cell at
 * {@code column2} (in that order), double quotes are stripped and occurrences of the
 * separator are replaced by a space. The result is stored at the smaller of the two
 * indices and the cell at the larger index is removed. When both indices are equal
 * nothing is changed.
 *
 * @param column1 index of the column whose text comes first; validated with {@code checkInvalidColumn}
 * @param column2 index of the column whose text comes second; validated with {@code checkInvalidColumn}
 */
public void merge(int column1, int column2) {
    checkInvalidColumn(column1);
    checkInvalidColumn(column2);
    if (column1 == column2) {
        // Nothing is merged, so the column count must stay as it is. Previously
        // numColumns was decremented even in this case.
        return;
    }
    int keep = Math.min(column1, column2);
    int drop = Math.max(column1, column2);
    for (List<String> row : this) {
        String merged = (row.get(column1) + row.get(column2))
                .replace("\"", "")
                .replace(sep, " ");
        row.set(keep, merged);
        // drop is an int, so this resolves to remove(int index), not remove(Object).
        row.remove(drop);
    }
    numColumns--;
}