/**
 * Creates an iterator over the IMDB sentiment review data set.
 *
 * @param dataDirectory  directory containing the extracted "aclImdb" review data set
 * @param wordVectors    WordVectors object used to look up word embeddings
 * @param batchSize      size of each minibatch for training
 * @param truncateLength reviews longer than this (in tokens) are truncated to this length
 * @param train          if true: return the training data; if false: return the testing data
 * @throws IOException if the positive/negative review directories cannot be listed
 */
public SentimentExampleIterator(String dataDirectory, WordVectors wordVectors, int batchSize,
                                int truncateLength, boolean train) throws IOException {
    this.batchSize = batchSize;
    // Embedding dimensionality is taken from the first word in the model's vocabulary.
    this.vectorSize = wordVectors.getWordVector(wordVectors.vocab().wordAtIndex(0)).length;

    File p = new File(FilenameUtils.concat(dataDirectory,
            "aclImdb/" + (train ? "train" : "test") + "/pos/") + "/");
    File n = new File(FilenameUtils.concat(dataDirectory,
            "aclImdb/" + (train ? "train" : "test") + "/neg/") + "/");
    positiveFiles = p.listFiles();
    negativeFiles = n.listFiles();
    // BUGFIX: File.listFiles() returns null when the directory is missing or unreadable,
    // which previously surfaced later as an uninformative NullPointerException.
    if (positiveFiles == null || negativeFiles == null) {
        throw new IOException("IMDB review data not found under " + p + " / " + n
                + " - check that dataDirectory points at the extracted data set");
    }

    this.wordVectors = wordVectors;
    this.truncateLength = truncateLength;

    tokenizerFactory = new DefaultTokenizerFactory();
    tokenizerFactory.setTokenPreProcessor(new CommonPreprocessor());
}
/**
 * Tokenizes a sentence and keeps only tokens present in the word-vector model's vocabulary.
 * (Rewritten from decompiler output: removes raw types and the unreachable labeled-loop
 * structure while preserving the exact filtering behavior.)
 *
 * @param sentence raw sentence text
 * @return in-order list of tokens known to {@code wordVectors}
 */
private List<String> tokenizeSentence(String sentence) {
    Tokenizer t = this.tokenizerFactory.create(sentence);
    List<String> tokens = new ArrayList<>();
    while (t.hasMoreTokens()) {
        String token = t.nextToken();
        // Out-of-vocabulary tokens are skipped: they have no embedding to feed the network.
        if (this.wordVectors.hasWord(token)) {
            tokens.add(token);
        }
    }
    return tokens;
}
// NOTE(review): fragment of a larger method - the braces opened below are closed outside this view.
IntArrayList labelToMerge = new IntArrayList();
IntArrayList tempIntArrayList = new IntArrayList();
TokenizerFactory tf = new DefaultTokenizerFactory();
// Word vectors resolved through the provider for the configured path.
WordVectors wv = WordVectorProvider.getWordVectors(config, wordVectorsPath);
for (int idx : order) {
    // NOTE(review): loop variable idx is not used to select the text here - presumably p
    // is advanced elsewhere in the loop body; verify against the full method.
    String text = p.getAllText();
    List<String> tokens = tf.create(text).getTokens();
    // Documents shorter than minTokens tokens are skipped and counted.
    if (tokens.size() < minTokens) {
        countSkippedOnSize++;
private Iterator<DataSet> createDataSets(NameSample sample) { TokenizerFactory tokenizerFactory = new DefaultTokenizerFactory(); tokenizerFactory.setTokenPreProcessor(new CommonPreprocessor()); String s = String.join(" ", sample.getSentence()); List<String> tokens = tokenizerFactory.create(s).getTokens(); String[] t = tokens.toArray(new String[tokens.size()]); // sample and t are different tokens at this point due to removing punctuation /*System.out.println("t = " + t.length); System.out.println(String.join(" ", t)); System.out.println("sample = " + sample.getSentence().length); System.out.println(String.join(" ", sample.getSentence())); System.out.println("--------");*/ List<INDArray> features = DeepLearningUtils.mapToFeatureMatrices(wordVectors, t, windowSize); List<INDArray> labels = DeepLearningUtils.mapToLabelVectors(sample, windowSize, this.labels); List<DataSet> dataSetList = new ArrayList<>(); for (int i = 0; i < features.size(); i++) { dataSetList.add(new DataSet(features.get(i), labels.get(i))); } return dataSetList.iterator(); }
/**
 * Initializes the iterator: runs the superclass setup, wires the configured token
 * pre-processor into the backend tokenizer factory, then loads the word vectors.
 * Call order matters: the pre-processor must be attached before vectors are initialized.
 */
@Override
public void initialize() {
    super.initialize();
    tokenizerFactory.getBackend().setTokenPreProcessor(tokenPreProcess.getBackend());
    initWordVectors();
}
// Default whitespace tokenization with the common pre-processor (lower-casing / punctuation cleanup).
TokenizerFactory t = new DefaultTokenizerFactory();
t.setTokenPreProcessor(new CommonPreprocessor());
/**
 * Constructor with necessary objects to create RNN features.
 *
 * @param data             Instances with documents and labels
 * @param wordVectors      WordVectors object
 * @param tokenFact        Tokenizer factory
 * @param tpp              Token pre processor
 * @param stopWords        Stop word object
 * @param sentenceProvider provider of the labeled sentences backing this iterator
 * @param batchSize        Size of each minibatch for training
 * @param truncateLength   reviews longer than this (in tokens) are truncated to this length
 */
public RnnTextEmbeddingDataSetIterator(
    Instances data,
    WordVectors wordVectors,
    TokenizerFactory tokenFact,
    TokenPreProcess tpp,
    AbstractStopwords stopWords,
    LabeledSentenceProvider sentenceProvider,
    int batchSize,
    int truncateLength) {
  this.batchSize = batchSize;
  // Embedding width is taken from the first vocabulary entry's vector.
  this.wordVectorSize = wordVectors.getWordVector(wordVectors.vocab().wordAtIndex(0)).length;
  this.data = data;
  this.wordVectors = wordVectors;
  this.truncateLength = truncateLength;
  this.tokenizerFactory = tokenFact;
  this.tokenizerFactory.getBackend().setTokenPreProcessor(tpp.getBackend());
  this.stopWords = stopWords;
  this.sentenceProvider = sentenceProvider;
}
/**
 * Used post training to convert a String to a features INDArray that can be passed to the
 * network output method.
 *
 * @param reviewContents contents of the review to vectorize
 * @param maxLength      maximum length (if review is longer than this: truncate to maxLength).
 *                       Use Integer.MAX_VALUE to not truncate
 * @return features array for the given input String
 */
public INDArray loadFeaturesFromString(String reviewContents, int maxLength) {
    List<String> tokens = tokenizerFactory.create(reviewContents).getTokens();
    // Keep only tokens the embedding model knows; unknown tokens have no vector.
    List<String> tokensFiltered = new ArrayList<>();
    for (String t : tokens) {
        if (wordVectors.hasWord(t)) {
            tokensFiltered.add(t);
        }
    }
    // BUGFIX: was Math.max, which with maxLength == Integer.MAX_VALUE tried to allocate a
    // (1 x vectorSize x Integer.MAX_VALUE) array; min truncates to the requested length.
    int outputLength = Math.min(maxLength, tokensFiltered.size());
    INDArray features = Nd4j.create(1, vectorSize, outputLength);

    // BUGFIX: iterate the filtered list, not the raw one - an out-of-vocabulary token yields
    // a null vector from getWordVectorMatrix and corrupts/fails the put below.
    for (int j = 0; j < tokensFiltered.size() && j < maxLength; j++) {
        String token = tokensFiltered.get(j);
        INDArray vector = wordVectors.getWordVectorMatrix(token);
        features.put(new INDArrayIndex[]{NDArrayIndex.point(0), NDArrayIndex.all(),
                NDArrayIndex.point(j)}, vector);
    }
    return features;
}
}
// Default whitespace tokenization with the common pre-processor (lower-casing / punctuation cleanup).
TokenizerFactory t = new DefaultTokenizerFactory();
t.setTokenPreProcessor(new CommonPreprocessor());
// Wire the configured pre-processor into the backing tokenizer factory so every
// tokenizer it creates applies the same token normalization.
this.tokenizerFactory.getBackend().setTokenPreProcessor(this.preprocessor.getBackend());
// NOTE(review): fragment - the loops opened below are closed outside this view.
// Tracks the longest token sequence seen across all reviews (presumably used later to
// size the feature array - verify against the full method).
int maxLength = 0;
for (String s : reviews) {
    List<String> tokens = tokenizerFactory.create(s).getTokens();
    // Filtered subset of tokens; the filter condition follows outside this view.
    List<String> tokensFiltered = new ArrayList<>();
    for (String t : tokens) {
// Default tokenizer wired with the externally supplied pre-processor (not the common default).
TokenizerFactory t = new DefaultTokenizerFactory();
t.setTokenPreProcessor(preprocessor);
// Wire the configured pre-processor into the backing tokenizer factory so every
// tokenizer it creates applies the same token normalization.
this.tokenizerFactory.getBackend().setTokenPreProcessor(this.preprocessor.getBackend());
/**
 * Tokenize single sentence. Uses {@link RnnTextEmbeddingDataSetIterator#tokenizerFactory} to
 * create the tokens, keeps only tokens contained in the
 * {@link RnnTextEmbeddingDataSetIterator#wordVectors} model, and then drops any token listed
 * in the configured {@link RnnTextEmbeddingDataSetIterator#stopWords}.
 *
 * @param sentence Sentence to be tokenized
 * @return Tokenized sentence
 */
protected List<String> tokenizeSingleSentence(String sentence) {
  List<String> kept = new ArrayList<>();
  for (String token : tokenizerFactory.getBackend().create(sentence).getTokens()) {
    // Same filter order as before: vocabulary membership first, then stop-word removal.
    if (wordVectors.hasWord(token) && !stopWords.isStopword(token)) {
      kept.add(token);
    }
  }
  return kept;
}
// Tokenize the raw content via the backend tokenizer; no vocabulary filtering is applied here.
List<String> words = this.tokenizerFactory.getBackend().create(content).getTokens();