private List<String> tokenizeSentence(String sentence) {
    Tokenizer t = this.tokenizerFactory.create(sentence);
    List<String> tokens = new ArrayList<>();
    // Keep only tokens that have an entry in the word vector vocabulary
    while (t.hasMoreTokens()) {
        String token = t.nextToken();
        if (this.wordVectors.hasWord(token)) {
            tokens.add(token);
        }
    }
    return tokens;
}
/**
 * Used post training to convert a String to a features INDArray that can be passed to the
 * network output method.
 *
 * @param reviewContents Contents of the review to vectorize
 * @param maxLength Maximum length (if the review is longer than this, truncate to maxLength).
 *                  Use Integer.MAX_VALUE to not truncate
 * @return Features array for the given input String
 */
public INDArray loadFeaturesFromString(String reviewContents, int maxLength) {
    List<String> tokens = tokenizerFactory.create(reviewContents).getTokens();
    List<String> tokensFiltered = new ArrayList<>();
    for (String t : tokens) {
        if (wordVectors.hasWord(t)) tokensFiltered.add(t);
    }
    // Size the output by the filtered token count, capped at maxLength
    int outputLength = Math.min(maxLength, tokensFiltered.size());
    INDArray features = Nd4j.create(1, vectorSize, outputLength);

    // Iterate the filtered tokens so every lookup is guaranteed to return a vector
    for (int j = 0; j < tokensFiltered.size() && j < maxLength; j++) {
        String token = tokensFiltered.get(j);
        INDArray vector = wordVectors.getWordVectorMatrix(token);
        features.put(new INDArrayIndex[]{NDArrayIndex.point(0), NDArrayIndex.all(),
            NDArrayIndex.point(j)}, vector);
    }
    return features;
}
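// Usage sketch (hedged): `net` is a trained MultiLayerNetwork and `iter` is an
// instance of the iterator class defining loadFeaturesFromString above; the review
// text and the 256-token cap are made-up example values.
INDArray features = iter.loadFeaturesFromString("Great movie, would watch again.", 256);
INDArray output = net.output(features);   // RNN output shape: [1, nOut, timeSeriesLength]
long lastStep = output.size(2) - 1;
// For a sequence classifier, read the class probabilities at the final time step
INDArray probabilities = output.get(NDArrayIndex.point(0), NDArrayIndex.all(),
    NDArrayIndex.point(lastStep));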
// Tokenize each review, keep only words with embeddings, and track the longest
// filtered sequence so the minibatch tensors can be sized to fit it
int maxLength = 0;
for (String s : reviews) {
    List<String> tokens = tokenizerFactory.create(s).getTokens();
    List<String> tokensFiltered = new ArrayList<>();
    for (String t : tokens) {
        if (wordVectors.hasWord(t)) tokensFiltered.add(t);
    }
    maxLength = Math.max(maxLength, tokensFiltered.size());
}
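// Sketch of how maxLength is typically used next (assumed fields: vectorSize,
// truncateLength, numClasses; this mirrors the padded-minibatch pattern, not
// verbatim code from this project):
if (maxLength > truncateLength) maxLength = truncateLength;   // cap sequence length
INDArray features = Nd4j.create(reviews.size(), vectorSize, maxLength);
INDArray labels = Nd4j.create(reviews.size(), numClasses, maxLength);
// Masks mark real time steps (1.0) versus zero padding (0.0) for variable-length input
INDArray featuresMask = Nd4j.zeros(reviews.size(), maxLength);
INDArray labelsMask = Nd4j.zeros(reviews.size(), maxLength);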
String text = p.getAllText();
List<String> tokens = tf.create(text).getTokens();
// Skip documents that fall below the minimum token threshold
if (tokens.size() < minTokens) {
    countSkippedOnSize++;
    continue;
}
/**
 * Tokenize a single sentence. Uses {@link RnnTextEmbeddingDataSetIterator#tokenizerFactory} to
 * create the tokens, keeps only tokens the
 * {@link RnnTextEmbeddingDataSetIterator#wordVectors} model contains, and further filters
 * against the given {@link RnnTextEmbeddingDataSetIterator#stopWords}.
 *
 * @param sentence Sentence to be tokenized
 * @return Tokenized sentence
 */
protected List<String> tokenizeSingleSentence(String sentence) {
    return tokenizerFactory
        .getBackend()
        .create(sentence)
        .getTokens()
        .stream()
        .filter(wordVectors::hasWord)
        .filter(t -> !stopWords.isStopword(t))
        .collect(Collectors.toList());
}
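// Hypothetical call: stopwords are dropped by the stopWords handler, and any
// token absent from the wordVectors vocabulary is dropped as well.
List<String> tokens = tokenizeSingleSentence("The movie was a pleasant surprise");
// e.g. ["movie", "pleasant", "surprise"], depending on the loaded model and stopword list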
private Iterator<DataSet> createDataSets(NameSample sample) {
    TokenizerFactory tokenizerFactory = new DefaultTokenizerFactory();
    tokenizerFactory.setTokenPreProcessor(new CommonPreprocessor());

    String s = String.join(" ", sample.getSentence());
    List<String> tokens = tokenizerFactory.create(s).getTokens();
    String[] t = tokens.toArray(new String[tokens.size()]);
    // Note: sample.getSentence() and t differ at this point because the
    // preprocessor strips punctuation tokens.

    List<INDArray> features = DeepLearningUtils.mapToFeatureMatrices(wordVectors, t, windowSize);
    List<INDArray> labels = DeepLearningUtils.mapToLabelVectors(sample, windowSize, this.labels);

    List<DataSet> dataSetList = new ArrayList<>();
    for (int i = 0; i < features.size(); i++) {
        dataSetList.add(new DataSet(features.get(i), labels.get(i)));
    }
    return dataSetList.iterator();
}
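// Hedged consumption sketch: fit a configured MultiLayerNetwork (here `net`,
// an assumed field) on each window-sized DataSet produced above.
Iterator<DataSet> dataSets = createDataSets(sample);
while (dataSets.hasNext()) {
    net.fit(dataSets.next());   // one parameter update per window example
}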
List<String> words = this.tokenizerFactory.getBackend().create(content).getTokens();