/**
 * Demo entry point: prints the Word2Vec cosine similarity between the
 * words "product" and "item" using the shared singleton model.
 */
public static void main(String[] args) {
    final W2VDistanceMeasurer measurer = W2VDistanceMeasurer.getInstance();
    final double similarity = measurer.vec.similarity("product", "item");
    System.out.println(similarity);
}
// Configure and train a deeplearning4j Word2Vec model.
// NOTE(review): vec, batchSize, windowSize, minWordFrequency, iterations,
// epochs, stopWords and iter are declared outside this fragment — confirm
// their types at the enclosing scope. vec is apparently declared with a
// broader type, hence the cast before fit().
vec = new org.deeplearning4j.models.word2vec.Word2Vec.Builder()
        .batchSize(batchSize) // # words per minibatch.
        .windowSize(windowSize)
        .minWordFrequency(minWordFrequency)
        // .useAdaGrad(false)
        // .layerSize(layerSize) // word feature vector size
        .seed(42)
        .iterations(iterations) // # iterations to train
        .epochs(epochs)
        .stopWords(stopWords)
        .learningRate(0.025)
        // .minLearningRate(0.001) // learning rate decays wrt # words. floor learning
        .negativeSample(10) // sample size 10 words
        .iterate(iter)
        // .tokenizerFactory(t)
        .build();
((org.deeplearning4j.models.word2vec.Word2Vec) vec).fit();
// Build and train a small Word2Vec model, then print the 10 words nearest
// to "french".
// NOTE(review): iter (sentence iterator) and t (tokenizer factory) come
// from outside this fragment — confirm their setup at the caller.
Word2Vec vec = new Word2Vec.Builder().minWordFrequency(5).iterations(1)
        .layerSize(100).seed(42).windowSize(5).iterate(iter)
        .tokenizerFactory(t).build();
vec.fit(); // trains on the corpus supplied via iterate(iter)
Collection<String> lst = vec.wordsNearest("french", 10);
System.out.println(lst);
// Full Word2Vec builder configuration mapped from this object's fields.
// NOTE(review): the built model is discarded as written — the assignment
// (or return) of .build()'s result appears to have been trimmed from this
// fragment; confirm against the original source.
new Word2Vec.Builder()
        .minWordFrequency(this.minWordFrequency)
        .useAdaGrad(this.useAdaGrad)
        .allowParallelTokenization(this.allowParallelTokenization)
        .enableScavenger(this.enableScavenger)
        .negativeSample(this.negativeSamplingValue)
        .sampling(this.subSamplingThres)
        .epochs(this.epochs)
        .learningRate(this.learningRate)
        .minLearningRate(this.minLearningRate)
        .workers(this.workers)
        .iterations(this.iterations)
        .layerSize(this.layerSize)
        .seed(this.seed)
        .windowSize(this.windowSize)
        .iterate(iter)
        .stopWords(this.stopWordsHandler.getStopList())
        .tokenizerFactory(this.tokenizerFactory.getBackend())
        .build();
// Word2Vec setup with an explicit in-memory vocab cache and lookup table,
// then a nearest-words query for "day".
t.setTokenPreProcessor(new CommonPreprocessor());
InMemoryLookupCache cache = new InMemoryLookupCache();
// NOTE(review): this builder chain is missing a terminating .build();
// before the Word2Vec declaration — as written it does not compile.
// Confirm against the original source.
WeightLookupTable table = new InMemoryLookupTable.Builder()
        .vectorLength(100)
Word2Vec vec = new Word2Vec.Builder()
        .minWordFrequency(5).iterations(1)
        .layerSize(100).lookupTable(table)
        .stopWords(new ArrayList<String>())
        .vocabCache(cache).seed(42)
        .windowSize(5).iterate(iter).tokenizerFactory(t).build();
vec.fit();
// NOTE(review): the enclosing try block starts outside this fragment, and
// the query below sits inside the catch as written — almost certainly a
// copy/paste artifact; verify brace placement in the original file.
} catch (IOException e) {
Collection<String> lst = vec.wordsNearest("day", 10);
System.out.println(lst);
// Load one (word, vector) pair from a binary model stream into the vocab
// cache and the parallel list of rows.
// NOTE(review): reader is defined outside this fragment, and the per-word
// section below was presumably the body of a loop (words is a counter) —
// confirm the enclosing loop at the caller.
AbstractCache<VocabWord> cache = new AbstractCache.Builder<VocabWord>().build();
List<INDArray> arrays = new ArrayList<>();
int words = 0;
String word = reader.readUTF();           // word token
INDArray row = Nd4j.read(reader);         // its embedding vector
VocabWord word1 = new VocabWord(1.0, word);
word1.setIndex(cache.numWords());         // next free index
cache.addToken(word1);
cache.addWordToIndex(word1.getIndex(), word);
cache.putVocabWord(word);
arrays.add(row);                          // arrays[i] aligns with vocab index i
words++;
/**
 * Converts the loaded Word2Vec model into a Weka {@code Instances} object:
 * one instance per vocabulary word, with one numeric attribute per
 * embedding dimension plus a trailing "word_id" string attribute.
 *
 * @return the full dataset (batch retrieval only)
 * @throws IOException if no source was set or incremental retrieval was requested
 */
@Override
public Instances getDataSet() throws IOException {
    if (m_sourceFile == null) {
        throw new IOException("No source has been specified");
    }
    if (getRetrieval() == INCREMENTAL) {
        throw new IOException("This loader cannot load instances incrementally.");
    }
    setRetrieval(BATCH);
    if (m_structure == null) {
        getStructure(); // lazily build the attribute header
    }
    Instances result = new Instances(m_structure);
    for (String word : vec.getVocab().words()) {
        // Hoist the embedding lookup: the original re-queried the model on
        // every iteration of the copy loop, including its condition.
        double[] embedding = this.vec.getWordVector(word);
        double[] values = new double[result.numAttributes()];
        System.arraycopy(embedding, 0, values, 0, embedding.length);
        // Last attribute stores the word itself as a string value.
        values[result.numAttributes() - 1] = result.attribute("word_id").addStringValue(word);
        Instance inst = new DenseInstance(1, values); // weight 1
        inst.setDataset(result);
        result.add(inst);
    }
    return result;
}
// Fragment: accumulate word vectors for the tokens of the current span.
// The first branch initialises curVec from the first in-vocabulary token;
// the else branch adds vectors for subsequent tokens. Braces are
// unbalanced here because the enclosing if/else starts outside this
// fragment — confirm structure at the original source.
if (vec.hasWord(curTokenText))
    curVec = vec.getWordVectorMatrix(curTokenText);
} else {
    if (vec.hasWord(curTokenText))
        curVec = curVec.add(vec.getWordVectorMatrix(curTokenText));
public void setStructure() { ArrayList<Attribute> att = new ArrayList<Attribute>(); // Add one attribute for each embedding dimension for (int i = 0; i < this.vec.getLayerSize(); i++) { att.add(new Attribute("embedding-" + i)); } att.add(new Attribute("word_id", (ArrayList<String>) null)); m_structure = new Instances("W2V model loaded from " + this.m_File.toString(), att, 0); }
// Cosine similarity between the two lemmas under the shared Word2Vec model.
double value = w2v.vec.similarity(lemma1, lemma2);
// Float.toString replaces the deprecated Float(double) boxing constructor;
// the explicit (float) narrowing keeps the rendered string identical to
// what new Float(value).toString() produced.
results.add(w2vPrefix + Float.toString((float) value));
// Fragment: the matching "if" (likely an identical-string short-circuit)
// is outside this view — confirm the condition at the original source.
sim = 1.0;
else
    // Fall back to model-based cosine similarity between the two strings.
    sim = vec.similarity(str1, str2);