/**
 * Compresses every sample of the given data set, distributing the work
 * over the index range [0, size-1] with a parallel-for construct.
 */
public void compress(final ListDataSet dataSet) {
    new PFor(0, dataSet.size() - 1) {
        @Override
        public void step(int index) {
            // Each worker compresses the sample at its assigned index.
            compress(dataSet.get(index));
        }
    };
}
/**
 * Splits this data set into one data set per target class.
 * The data set at index {@code i} contains clones of all samples whose
 * target class equals {@code i}.
 *
 * @return a list with one labeled data set per class
 */
public final List<ListDataSet> splitByClass() {
    final List<ListDataSet> perClass = new ArrayList<ListDataSet>();
    for (int classIndex = 0; classIndex < getClassCount(); classIndex++) {
        final ListDataSet classSet = DataSet.Factory.labeledDataSet("Class " + classIndex);
        // Collect a clone of every sample that belongs to this class.
        for (Sample sample : this) {
            if (sample.getTargetClass() == classIndex) {
                classSet.add(sample.clone());
            }
        }
        perClass.add(classSet);
    }
    return perClass;
}
@Override public void step(int i) { Sample sample = dataSet.get(i); compress(sample); } };
/**
 * Splits this data set into {@code count.length + 1} parts: part {@code i}
 * receives {@code count[i]} samples, and the final part receives whatever
 * remains.
 *
 * @param shuffle if true, samples are drawn at random positions; otherwise
 *                they are taken in order from the front
 * @param count   the number of samples for each of the leading parts
 * @return the list of resulting data sets
 */
public final List<ListDataSet> splitByCount(boolean shuffle, int... count) {
    final List<ListDataSet> parts = new ArrayList<ListDataSet>();
    final List<Sample> remaining = new FastArrayList<Sample>();
    remaining.addAll(this);
    for (int part = 0; part < count.length; part++) {
        final ListDataSet partSet = DataSet.Factory.labeledDataSet("DataSet" + part);
        for (int n = 0; n < count[part]; n++) {
            // Random draw when shuffling, otherwise pop from the front.
            final int pick = shuffle ? MathUtil.nextInteger(remaining.size()) : 0;
            partSet.add(remaining.remove(pick));
        }
        parts.add(partSet);
    }
    // Leftover samples form the last data set.
    final ListDataSet rest = DataSet.Factory.labeledDataSet("DataSet" + count.length);
    rest.addAll(remaining);
    parts.add(rest);
    return parts;
}
/**
 * Wraps a {@code ListDataSet} as a Weka {@code Instances} object.
 *
 * @param dataSet       the data set to wrap
 * @param discrete      whether attributes are treated as discrete
 * @param includeTarget if true, the target column is registered as the class
 *                      attribute and included in each instance
 */
public DataSetToInstancesWrapper(ListDataSet dataSet, boolean discrete, boolean includeTarget) {
    super(dataSet.getLabel(), new DataSetToAttributeInfoWrapper(dataSet, discrete),
            dataSet.size());
    if (includeTarget) {
        // Class attribute sits directly after the feature columns.
        setClassIndex(dataSet.getFeatureCount());
    }
    for (Sample sample : dataSet) {
        add(new SampleToInstanceWrapper(sample.getAsMatrix(INPUT),
                sample.getAsMatrix(WEIGHT), sample.getAsMatrix(TARGET), discrete,
                includeTarget));
    }
}
/**
 * Counts single-product and product-pair occurrences over all relational
 * samples in the data set, then derives association metrics.
 *
 * @param dataSet data set of {@code RelationalSample}s, each holding a
 *                collection of products
 * @return the result of {@code calculateP(minSupport)}
 * @throws Exception if counting or the final calculation fails
 */
public ListDataSet calculate(ListDataSet dataSet) throws Exception {
    product1ToIds.setLabel("Product 1 Ids");
    product2ToIds.setLabel("Product 2 Ids");
    Matrix product1Count = new CountMatrix(product1ToIds);
    product1Count.setLabel("Product 1 Count");
    Matrix product2Count = new CountMatrix(product2ToIds);
    product2Count.setLabel("Product 2 Count");
    for (int r = 0; r < dataSet.size(); r++) {
        // Progress output for long-running scans.
        if (r % 1000 == 0) {
            System.out.println(r + " of " + dataSet.size());
        }
        RelationalSample s = (RelationalSample) dataSet.get(r);
        Collection<?> products = s.getObjects();
        // Skip empty baskets; they contribute no counts.
        if (!products.isEmpty()) {
            addProduct1Count(products, r);
            addProduct2Count(products, r);
        }
    }
    return calculateP(minSupport);
}
new PFor(0, dataSet.size() - 1) { if (dataSet.get(0).getAsMatrix(getTargetLabel()) != null) { final Matrix confusion; double error = 0.0; Matrix rmse = Matrix.Factory.linkToValue(Math.sqrt(error / dataSet.size())); rmse.setLabel("RMSE with " + getLabel()); dataSet.setMatrix(Variable.RMSE, rmse); dataSet.setMatrix(Variable.CONFUSION, confusion); / (double) dataSet.size()); accuracy.setLabel("Accuracy with " + getLabel()); dataSet.setMatrix(Variable.ACCURACY, accuracy); dataSet.setMatrix(Variable.ERRORCOUNT, errorMatrix); dataSet.setMatrix(Variable.SENSITIVITY, sensitivity); dataSet.setMatrix(Variable.SPECIFICITY, specificity); dataSet.setMatrix(Variable.PRECISION, precision); dataSet.setMatrix(Variable.RECALL, recall); dataSet.setMatrix(Variable.FMEASURE, fmeasure); dataSet.setMatrix(Variable.FMEASUREMACRO, fmeasureMacro);
@Test
public void testTagger() throws Exception {
    // Skip silently when no tagger implementation is available.
    if (tagger == null) {
        return;
    }
    ListDataSet dataSet = new DefaultListDataSet();
    Sample first = new DefaultSample();
    first.put(Sample.INPUT, s1);
    Sample second = new DefaultSample();
    second.put(Sample.INPUT, s2);
    dataSet.add(first);
    dataSet.add(second);
    // Tokenize the raw input, then run the tagger over the tokens.
    tokenizer.tokenize(Sample.INPUT, dataSet);
    tagger.tag(dataSet);
    Matrix tagged1 = dataSet.get(0).getAsMatrix(Tagger.TAGGED);
    Matrix tagged2 = dataSet.get(1).getAsMatrix(Tagger.TAGGED);
    // Each tagged matrix has two columns (token, tag) and one row per token.
    assertEquals(2, tagged1.getColumnCount());
    assertEquals(11, tagged1.getRowCount());
    assertEquals(2, tagged2.getColumnCount());
    assertEquals(5, tagged2.getRowCount());
}
}
// Metadata for Fisher's classic Iris data set.
iris.setLabel("Iris flower data set");
iris.setMetaData(Sample.URL, "http://archive.ics.uci.edu/ml/datasets/Iris");
iris.setDescription("Fisher's Iris data set is a multivariate data set introduced by Sir Ronald Aylmer Fisher (1936) as an example of discriminant analysis.");
// The leading samples (s0..s5 visible here; s0..s6 declared outside this
// span) all belong to the Iris-setosa class and get sequential ids.
s0.setLabel("Iris-setosa");
s0.setId("iris-0");
iris.add(s0);
s1.setLabel("Iris-setosa");
s1.setId("iris-1");
iris.add(s1);
s2.setLabel("Iris-setosa");
s2.setId("iris-2");
iris.add(s2);
s3.setLabel("Iris-setosa");
s3.setId("iris-3");
iris.add(s3);
s4.setLabel("Iris-setosa");
s4.setId("iris-4");
iris.add(s4);
s5.setLabel("Iris-setosa");
s5.setId("iris-5");
iris.add(s5);
s6.setLabel("Iris-setosa");
/**
 * Trains one binary classifier per target class (one-vs-rest scheme).
 * Each classifier is an empty copy of {@code singleClassClassifier} trained
 * on the full input and the single target column for its class.
 *
 * @param dataSet the labeled training data
 */
public void trainAll(ListDataSet dataSet) {
    reset();
    classCount = getClassCount(dataSet);
    // Hoisted out of the loop: input and target matrices are the same for
    // every class, so there is no need to fetch them per iteration.
    final Matrix input = dataSet.getInputMatrix();
    final Matrix allTargets = dataSet.getTargetMatrix();
    for (int i = 0; i < classCount; i++) {
        System.out.println("Training class " + i);
        Classifier c = singleClassClassifier.emptyCopy();
        singleClassClassifiers.add(c);
        Matrix target = allTargets.selectColumns(Ret.LINK, i);
        if (twoColumns) {
            // Add a complement column (|target - 1|) so the classifier sees
            // an explicit negative-class indicator as well.
            Matrix target2 = target.minus(1).abs(Ret.NEW);
            target = Matrix.Factory.horCat(target, target2);
        }
        ListDataSet ds = DataSet.Factory.linkToInputAndTarget(input, target);
        c.trainAll(ds);
    }
}
@Test
public void testMLP() throws Exception {
    ListDataSet iris = ListDataSet.Factory.IRIS();
    // Standardize features in place so each row has comparable scale.
    iris.getInputMatrix().standardize(Ret.ORIG, Matrix.ROW);
    MultiLayerNetwork network = new MultiLayerNetwork(10);
    network.setLearningRate(0.05);
    // 300 training epochs over the full data set.
    for (int epoch = 0; epoch < 300; epoch++) {
        network.trainOnce(iris);
    }
    network.predictAll(iris);
    // Expect roughly 90% accuracy (wide tolerance for random init).
    assertEquals(0.90, iris.getAccuracy(), 0.2);
}
}
public static void main(String[] args) throws Exception { Matrix data = Matrix.Factory.linkTo().file("/home/arndt/muenchen/totale2.txt").asDenseCSV(); // data.showGUI(); ListDataSet orig = new DefaultListDataSet(); // for (int r = 0; r < 10000; r++) { for (int r = 0; r < data.getRowCount(); r++) { if (r % 1000 == 0) { System.out.println(r + " of " + data.getRowCount()); } Collection<?> products = getProductsInLine(data, r); if (products.size() != 0) { RelationalSample s = Sample.Factory.relationalSample(products); orig.add(s); } } MarketBasketAnalysis mba = new MarketBasketAnalysis(); orig.showGUI(); ListDataSet ds = mba.calculate(orig); ds.showGUI(); }
/** Returns the label of the wrapped data set (pure delegation). */
public String getLabel() {
    final String label = dataSet.getLabel();
    return label;
}
/**
 * Searches the Lucene index and returns matching samples, each annotated
 * with its score and up to ten "more like this" tag suggestions.
 *
 * Fix: the original executed an extra {@code search(query, 100)} whose
 * result was assigned to an unused local and discarded — that dead query
 * has been removed.
 *
 * @param query the Lucene query to execute
 * @param start offset of the first result
 *              (NOTE(review): currently ignored — TODO honor paging)
 * @param count maximum number of results to return
 * @return a data set with one sample per hit; total hit count in metadata
 * @throws Exception if searching or deserialization fails
 */
public synchronized ListDataSet search(Query query, int start, int count) throws Exception {
    System.out.println("searching for: " + query);
    MoreLikeThis mlt = new MoreLikeThis(getIndexSearcher().getIndexReader());
    mlt.setFieldNames(new String[] { Variable.LABEL, Variable.DESCRIPTION, Variable.TAGS });
    mlt.setMaxWordLen(MAXWORDLENGTH);
    TopDocs td = indexSearcher.search(query, count);
    ListDataSet result = new DefaultListDataSet();
    result.setMetaData("Total", td.totalHits);
    for (ScoreDoc sd : td.scoreDocs) {
        int id = sd.doc;
        Document doc = indexSearcher.doc(id);
        // Samples are stored serialized in the binary "RawData" field.
        Sample s = (Sample) SerializationUtil.deserialize(doc.getBinaryValue("RawData").bytes);
        s.put(Sample.SCORE, MathUtil.getMatrix(sd.score));
        // Attach at most ten interesting terms as tag suggestions.
        String[] terms = mlt.retrieveInterestingTerms(id);
        for (int i = 0; i < 10 && i < terms.length; i++) {
            s.put(Variable.SUGGESTEDTAGS, terms[i]);
        }
        result.add(s);
    }
    return result;
}
public static void main(String[] args) { // load example data set ListDataSet dataSet = DataSet.Factory.IRIS(); // create a classifier LinearRegression classifier = new LinearRegression(); // train the classifier using all data classifier.trainAll(dataSet); // use the classifier to make predictions classifier.predictAll(dataSet); // get the results double accurary = dataSet.getAccuracy(); System.out.println("accuracy: " + accurary); }
Matrix valueCounts = dataSet.getInputMatrix().max(Ret.NEW, Matrix.ROW).plus(1); for (int j = 0; j < dataSet.getInputMatrix().getColumnCount(); j++) { weka.core.Attribute a = null; if (discrete) { int classCount = dataSet.getClassCount(); for (int i = 0; i < classCount; i++) { classes.addElement("Class " + i);
// Split into train/test partitions for this cross-validation fold; the
// seed varies per run so folds differ between repetitions.
List<ListDataSet> dss = dataSet.splitForCV(folds, fold, randomSeed + run);
ListDataSet train = dss.get(0);
ListDataSet test = dss.get(1);
// NOTE(review): `train` is never used in this visible span — confirm the
// algorithm is trained on `train` somewhere outside this excerpt before
// predictAll is called, otherwise the evaluation is meaningless.
algorithm.predictAll(test);
// Accumulate per-fold evaluation metrics.
acc.add(test.getAccuracy());
fm.add(test.getAsDouble(Variable.FMEASUREMACRO));
sens.add(test.getAsDouble(Variable.SENSITIVITY));
spec.add(test.getAsDouble(Variable.SPECIFICITY));
prec.add(test.getAsDouble(Variable.PRECISION));
rec.add(test.getAsDouble(Variable.RECALL));
rmse.add(test.getAsDouble(Variable.RMSE));
System.out.print(test.getAsDouble(Variable.FMEASUREMACRO) + "\t");
/**
 * Initializes the random projection matrix from the data set's feature
 * count: one Gaussian random vector per hash bit.
 */
public void train(ListDataSet dataSet) {
    randomVectors = Matrix.Factory.randn(dataSet.getFeatureCount(), numberOfBits);
}
/**
 * Builds a single matrix with one row per sample: the feature columns come
 * first, followed by the target columns.
 *
 * @param dataSet the data set to flatten
 * @return a (samples x (features + classes)) matrix
 */
private Matrix createCompleteMatrix(ListDataSet dataSet) {
    final int rows = dataSet.size();
    final int features = getFeatureCount(dataSet);
    final int classes = getClassCount(dataSet);
    final Matrix complete = Matrix.Factory.zeros(rows, features + classes);
    for (int row = 0; row < rows; row++) {
        final Sample sample = dataSet.get(row);
        // Flatten input and target to row vectors before copying.
        final Matrix input = sample.getAsMatrix(getInputLabel()).toColumnVector(Ret.NEW);
        final Matrix target = sample.getAsMatrix(getTargetLabel()).toColumnVector(Ret.NEW);
        for (int col = 0; col < features; col++) {
            complete.setAsDouble(input.getAsDouble(0, col), row, col);
        }
        for (int col = 0; col < classes; col++) {
            complete.setAsDouble(target.getAsDouble(0, col), row, features + col);
        }
    }
    return complete;
}