/**
 * Creates an iterator over the IMDB sentiment review data set.
 *
 * @param dataDirectory  directory containing the extracted {@code aclImdb} data set
 * @param wordVectors    WordVectors object used to embed tokens
 * @param batchSize      size of each minibatch for training
 * @param truncateLength reviews exceeding this many tokens are truncated to this length
 * @param train          if true: return the training data; if false: return the testing data
 * @throws IOException if the expected review directories are missing or unreadable
 */
public SentimentExampleIterator(String dataDirectory, WordVectors wordVectors, int batchSize,
        int truncateLength, boolean train) throws IOException {
    this.batchSize = batchSize;
    // Infer the embedding dimensionality from an arbitrary word in the vocabulary.
    this.vectorSize = wordVectors.getWordVector(wordVectors.vocab().wordAtIndex(0)).length;

    String subset = train ? "train" : "test";
    File p = new File(FilenameUtils.concat(dataDirectory, "aclImdb/" + subset + "/pos/") + "/");
    File n = new File(FilenameUtils.concat(dataDirectory, "aclImdb/" + subset + "/neg/") + "/");
    positiveFiles = p.listFiles();
    negativeFiles = n.listFiles();
    // File.listFiles() returns null when the path does not exist or is not a directory;
    // fail fast here instead of with an NPE while iterating.
    if (positiveFiles == null) {
        throw new IOException("Cannot list review files in " + p.getAbsolutePath());
    }
    if (negativeFiles == null) {
        throw new IOException("Cannot list review files in " + n.getAbsolutePath());
    }

    this.wordVectors = wordVectors;
    this.truncateLength = truncateLength;
    tokenizerFactory = new DefaultTokenizerFactory();
    tokenizerFactory.setTokenPreProcessor(new CommonPreprocessor());
}
private Iterator<DataSet> createDataSets(NameSample sample) { TokenizerFactory tokenizerFactory = new DefaultTokenizerFactory(); tokenizerFactory.setTokenPreProcessor(new CommonPreprocessor()); String s = String.join(" ", sample.getSentence()); List<String> tokens = tokenizerFactory.create(s).getTokens(); String[] t = tokens.toArray(new String[tokens.size()]); // sample and t are different tokens at this point due to removing punctuation /*System.out.println("t = " + t.length); System.out.println(String.join(" ", t)); System.out.println("sample = " + sample.getSentence().length); System.out.println(String.join(" ", sample.getSentence())); System.out.println("--------");*/ List<INDArray> features = DeepLearningUtils.mapToFeatureMatrices(wordVectors, t, windowSize); List<INDArray> labels = DeepLearningUtils.mapToLabelVectors(sample, windowSize, this.labels); List<DataSet> dataSetList = new ArrayList<>(); for (int i = 0; i < features.size(); i++) { dataSetList.add(new DataSet(features.get(i), labels.get(i))); } return dataSetList.iterator(); }
@Override
public void initialize() {
    // Run the superclass setup first, then wire the configured token pre-processor
    // into the backing tokenizer factory before the word vectors are initialized.
    super.initialize();
    tokenizerFactory.getBackend().setTokenPreProcessor(tokenPreProcess.getBackend());
    initWordVectors();
}
// Normalize tokens (lowercase, strip punctuation) via the common pre-processor.
t.setTokenPreProcessor(new CommonPreprocessor());
/** * Constructor with necessary objects to create RNN features. * * @param data Instances with documents and labels * @param wordVectors WordVectors object * @param tokenFact Tokenizer factory * @param tpp Token pre processor * @param stopWords Stop word object * @param batchSize Size of each minibatch for training * @param truncateLength If reviews exceed */ public RnnTextEmbeddingDataSetIterator( Instances data, WordVectors wordVectors, TokenizerFactory tokenFact, TokenPreProcess tpp, AbstractStopwords stopWords, LabeledSentenceProvider sentenceProvider, int batchSize, int truncateLength) { this.batchSize = batchSize; this.wordVectorSize = wordVectors.getWordVector(wordVectors.vocab().wordAtIndex(0)).length; this.data = data; this.wordVectors = wordVectors; this.truncateLength = truncateLength; this.tokenizerFactory = tokenFact; this.tokenizerFactory.getBackend().setTokenPreProcessor(tpp.getBackend()); this.stopWords = stopWords; this.sentenceProvider = sentenceProvider; }
// Normalize tokens (lowercase, strip punctuation) via the common pre-processor.
t.setTokenPreProcessor(new CommonPreprocessor());
// Install the externally supplied pre-processor on this tokenizer.
t.setTokenPreProcessor(preprocessor);
// Wire the configured pre-processor into the tokenizer factory's backend.
this.tokenizerFactory.getBackend().setTokenPreProcessor(this.preprocessor.getBackend());
// Wire the configured pre-processor into the tokenizer factory's backend.
this.tokenizerFactory.getBackend().setTokenPreProcessor(this.preprocessor.getBackend());