|
| 1 | +from __future__ import absolute_import, division, print_function |
| 2 | + |
| 3 | +from argparse import ArgumentParser |
| 4 | +from collections import Counter |
| 5 | +from sklearn.metrics.pairwise import cosine_similarity |
| 6 | +import numpy as np |
| 7 | + |
| 8 | + |
def print_nearest_words(args):
    """Print the words whose vectors are most cosine-similar to ``args.word``.

    The vectors file is plain text with one entry per line: a word followed
    by its whitespace-separated float components.

    Args:
        args: Object exposing the attributes
            ``vectors`` (path of the word-vector file),
            ``word`` (query word),
            ``num_words`` (how many neighbours to print) and
            ``verbose`` (when true, print the score before each word).

    Raises:
        KeyError: If ``args.word`` has no entry in the vectors file.
    """
    # Load the word vectors. A later entry for the same word replaces an
    # earlier one.
    embeddings_index = {}
    with open(args.vectors) as vector_file:
        for line in vector_file:
            # split() without an argument drops the trailing newline and
            # tolerates repeated or trailing whitespace.
            fields = line.split()
            if len(fields) < 2:
                # Blank line, or a word without any components.
                continue
            embeddings_index[fields[0]] = np.asarray(fields[1:], dtype='float32')

    if args.word not in embeddings_index:
        raise KeyError('word %r not found in %s' % (args.word, args.vectors))
    query = embeddings_index[args.word]

    # Compare only against vectors of the query's dimensionality; this also
    # skips a "<vocab_size> <dim>" style header line if the file has one.
    words = []
    rows = []
    for candidate, vector in embeddings_index.items():
        if candidate != args.word and vector.shape == query.shape:
            words.append(candidate)
            rows.append(vector)
    if not rows:
        return

    # Score every candidate with a single matrix-vector product.
    matrix = np.vstack(rows)
    norms = np.linalg.norm(matrix, axis=1)
    # Treat a zero norm as 1 so an all-zero vector scores 0 instead of
    # triggering a division by zero.
    norms[norms == 0] = 1.0
    query_norm = np.linalg.norm(query) or 1.0
    scores = matrix.dot(query) / (norms * query_norm)

    # Stable descending sort: ties keep the order in which words were loaded.
    order = np.argsort(-scores, kind='mergesort')
    for idx in order[:max(args.num_words, 0)]:
        if args.verbose:
            print(scores[idx], words[idx])
        else:
            print(words[idx])
| 36 | + |
| 37 | + |
def _parse_bool(text):
    """Convert a command-line string such as 'true' or '0' to a bool.

    ``type=bool`` cannot be used for this: ``bool('False')`` is ``True``
    because every non-empty string is truthy.

    Raises:
        ValueError: If ``text`` is not a recognised boolean spelling;
            argparse reports this as an invalid argument value.
    """
    lowered = text.strip().lower()
    if lowered in ('1', 'true', 't', 'yes', 'y', 'on'):
        return True
    if lowered in ('0', 'false', 'f', 'no', 'n', 'off', ''):
        return False
    raise ValueError('expected a boolean value, got %r' % (text,))


def build_parser():
    """Return the command-line parser for the nearest-words script."""
    parser = ArgumentParser()
    parser.add_argument('--vectors', default='vectors.txt', help='Word vector file')
    # NOTE: --vocab is accepted for command-line compatibility; nothing in
    # this block reads it.
    parser.add_argument('--vocab', default='vocab.txt', help='Vocab file')
    parser.add_argument('--word', default='dollar', help='Input word')
    # nargs='?' with const=True lets a bare "--verbose" act as a flag while
    # the older "--verbose True" / "--verbose False" forms keep working.
    parser.add_argument('--verbose', type=_parse_bool, nargs='?', const=True,
                        default=False, help='Print score')
    parser.add_argument('--num_words', type=int, default=5, help='Number of closest words to print')
    return parser


if __name__ == '__main__':
    args = build_parser().parse_args()

    print_nearest_words(args)
0 commit comments