/**
 * Creates an iterator over the IMDB (aclImdb) sentiment review data set.
 *
 * @param dataDirectory the directory of the IMDB review data set
 * @param wordVectors WordVectors object used to embed review tokens
 * @param batchSize Size of each minibatch for training
 * @param truncateLength If reviews exceed this many tokens, they are truncated to this length
 * @param train If true: return the training data. If false: return the testing data.
 * @throws IOException if the expected aclImdb/{train|test}/{pos|neg} directories are missing or unreadable
 */
public SentimentExampleIterator(String dataDirectory, WordVectors wordVectors, int batchSize, int truncateLength, boolean train) throws IOException {
    this.batchSize = batchSize;
    // All vectors in the model share one length; read it off the first vocabulary word.
    this.vectorSize = wordVectors.getWordVector(wordVectors.vocab().wordAtIndex(0)).length;
    String subset = train ? "train" : "test";
    File p = new File(FilenameUtils.concat(dataDirectory, "aclImdb/" + subset + "/pos/") + "/");
    File n = new File(FilenameUtils.concat(dataDirectory, "aclImdb/" + subset + "/neg/") + "/");
    positiveFiles = p.listFiles();
    negativeFiles = n.listFiles();
    // File.listFiles() returns null (not an empty array) when the path is not a readable
    // directory; fail fast here instead of NPE-ing later during iteration.
    if (positiveFiles == null || negativeFiles == null) {
        throw new IOException("Cannot list IMDB review files under " + p + " or " + n
            + " - check that the aclImdb data set has been extracted to " + dataDirectory);
    }
    this.wordVectors = wordVectors;
    this.truncateLength = truncateLength;
    tokenizerFactory = new DefaultTokenizerFactory();
    // Lower-case and strip punctuation so tokens match the word-vector vocabulary.
    tokenizerFactory.setTokenPreProcessor(new CommonPreprocessor());
}
// NOTE(review): fragment - the enclosing method and the for-loop body lie outside this view; skip restyling.
// Accumulators for label indices - presumably filled and merged inside the loop; confirm against the full method.
IntArrayList labelToMerge = new IntArrayList();
IntArrayList tempIntArrayList = new IntArrayList();
// Tokenizer with no pre-processor configured here, so tokens keep their original form at this point.
TokenizerFactory tf = new DefaultTokenizerFactory();
// Obtain (possibly cached) word vectors for the configured path - semantics of WordVectorProvider not visible here.
WordVectors wv = WordVectorProvider.getWordVectors(config, wordVectorsPath);
// Iterate samples in the externally supplied index order (presumably shuffled - verify at the caller); body not visible.
for (int idx : order) {
private Iterator<DataSet> createDataSets(NameSample sample) { TokenizerFactory tokenizerFactory = new DefaultTokenizerFactory(); tokenizerFactory.setTokenPreProcessor(new CommonPreprocessor()); String s = String.join(" ", sample.getSentence()); List<String> tokens = tokenizerFactory.create(s).getTokens(); String[] t = tokens.toArray(new String[tokens.size()]); // sample and t are different tokens at this point due to removing punctuation /*System.out.println("t = " + t.length); System.out.println(String.join(" ", t)); System.out.println("sample = " + sample.getSentence().length); System.out.println(String.join(" ", sample.getSentence())); System.out.println("--------");*/ List<INDArray> features = DeepLearningUtils.mapToFeatureMatrices(wordVectors, t, windowSize); List<INDArray> labels = DeepLearningUtils.mapToLabelVectors(sample, windowSize, this.labels); List<DataSet> dataSetList = new ArrayList<>(); for (int i = 0; i < features.size(); i++) { dataSetList.add(new DataSet(features.get(i), labels.get(i))); } return dataSetList.iterator(); }
// Build a tokenizer whose tokens are normalized (lower-cased, punctuation stripped) by CommonPreprocessor.
TokenizerFactory t = new DefaultTokenizerFactory(); t.setTokenPreProcessor(new CommonPreprocessor());
// Build a tokenizer whose tokens are normalized (lower-cased, punctuation stripped) by CommonPreprocessor.
TokenizerFactory t = new DefaultTokenizerFactory(); t.setTokenPreProcessor(new CommonPreprocessor());
// Build a tokenizer using the caller-supplied pre-processor; normalization behavior
// depends entirely on that instance (defined elsewhere - confirm its contract at the call site).
TokenizerFactory t = new DefaultTokenizerFactory(); t.setTokenPreProcessor(preprocessor);