"""Report the most similar pairs of PLOS Biology articles.

Every article body found in the ``.dat`` dumps under ``DATA_DIR`` is turned
into a TF-IDF vector.  For each article the single most similar *other*
article is looked up, and the five best-scoring pairs are printed together
with their titles and article URLs.
"""
import plospy
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import itertools

# Directory that holds the PLOS Biology ``.dat`` dumps read by plospy.
DATA_DIR = '../plos/plos_biology/plos_biology_data'
# Prefix that, followed by an article id, forms the article's URL.
ARTICLE_URL_PREFIX = 'http://www.plosbiology.org/article/info%3Adoi%2F'

all_names = [name for name in os.listdir(DATA_DIR) if '.dat' in name]
print(len(all_names))

# Filled in by get_corpus() while it is being consumed; position i in both
# lists corresponds to row i of the TF-IDF matrix.
ids = []
titles = []


def get_corpus(all_names):
    """Yield the body text of every article in the given dump files.

    This is a generator: it parses one file at a time so the whole corpus
    never has to be held in memory.  As a side effect, each yielded article
    has its id and title appended to the module-level ``ids`` and ``titles``
    lists, in the same order as the yielded bodies.

    Args:
        all_names: iterable of file names located inside ``DATA_DIR``.

    Yields:
        ``article['body']`` for each article of each file.
    """
    for name in all_names:
        docs = plospy.PlosXml(os.path.join(DATA_DIR, name))
        for article in docs.docs:
            ids.append(article['id'])
            titles.append(article['title'])
            yield article['body']


def _nearest_neighbour(matrix, index):
    """Return ``(row, score)`` of the row most similar to ``matrix[index]``.

    The row itself is excluded explicitly instead of assuming it sorts
    first: an all-zero row (empty body) or an exact duplicate would
    otherwise tie with, or beat, the self-match.  With a single-row matrix
    the returned score is ``-inf``.
    """
    # TfidfVectorizer L2-normalises rows by default, so the plain dot
    # product computed by linear_kernel is the cosine similarity.
    similarities = linear_kernel(matrix[index:index + 1], matrix).flatten()
    similarities[index] = float('-inf')
    neighbour = int(similarities.argmax())
    return neighbour, float(similarities[neighbour])


corpus = get_corpus(all_names)
tfidf = TfidfVectorizer().fit_transform(corpus)

# Each slot is [article index, nearest neighbour index, cosine similarity];
# [-1, -1, -1] marks a slot that has not been filled yet.
top_five = [[-1, -1, -1] for _ in range(5)]
# Lowest score currently held in top_five; a candidate must beat it.
threshold = -1.

for index in range(tfidf.shape[0]):
    second, score = _nearest_neighbour(tfidf, index)
    if score <= threshold:
        continue
    first = index
    # Skip an article that already appears in a kept pair, so that a pair
    # (a, b) is not reported a second time as (b, a).
    if any(first in (top[0], top[1]) for top in top_five):
        continue
    scores = [top[2] for top in top_five]
    top_five[scores.index(min(scores))] = [first, second, score]
    # Recompute from the updated list: the evicted score is no longer the
    # bar to beat, the weakest pair still kept is.
    threshold = min(top[2] for top in top_five)

for tf in top_five:
    if tf[0] < 0:
        # Unfilled placeholder (fewer than five pairs were found).
        continue
    print('')
    print('Cosine Similarity: %.2f' % tf[2])
    print('Title 1: %s' % titles[tf[0]])
    print(ARTICLE_URL_PREFIX + str(ids[tf[0]]))
    print('')
    print('Title 2: %s' % titles[tf[1]])
    print(ARTICLE_URL_PREFIX + str(ids[tf[1]]))
    print('')