Truncated SVD (Latent Semantic Analysis)
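The snippet below builds a TF-IDF term-document matrix over a small corpus, factorizes it with scikit-learn's TruncatedSVD so that each component (singular vector) acts as a latent "concept", and then scores every sentence by how strongly it overlaps with each concept's top terms.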
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
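# word_tokenize below needs the NLTK 'punkt' tokenizer models; if missing,
# run nltk.download('punkt') once.
# The original listing never defines `dataset`; this toy corpus is purely
# illustrative so the snippet runs end to end.
dataset = [
    'The stock market rallied after the strong earnings report.',
    'Investors weighed inflation data against rising bond yields.',
    'The team trained a neural network on labeled images.',
    'Gradient descent slowly minimizes the loss on the training set.',
    'The chef reduced the sauce over low heat for an hour.',
    'Fresh basil brightens a simple tomato pasta.',
    'The senate debated the new infrastructure bill late into the night.',
    'Voters turned out in record numbers for the local election.',
]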
# Build the TF-IDF term-document matrix, then fit the SVD model to it
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(dataset)
svd = TruncatedSVD(n_components=4, n_iter=100)
svd.fit(X)
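# Optional sanity check: explained_variance_ratio_ (a fitted attribute of
# TruncatedSVD) reports the share of variance each of the four concepts captures
print(svd.explained_variance_ratio_)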
# Get concept words
concept_words = {}
terms = vectorizer.get_feature_names_out()  # get_feature_names() was removed in scikit-learn 1.2
for i, comp in enumerate(svd.components_):
    # Pair each vocabulary term with its weight in this component
    component_terms = zip(terms, comp)
    # Keep the ten highest-weighted terms as the concept's descriptors
    sorted_terms = sorted(component_terms, key=lambda x: x[1], reverse=True)[:10]
    concept_words[f'Concept {i}'] = sorted_terms
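# concept_words now maps each concept label to its ten strongest
# (term, weight) pairs, e.g. {'Concept 0': [('market', 0.41), ...]}
# (the weight shown is illustrative, not a real output)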
# Get scored relevance of corpus to concepts
svd_scores = {}
for key in concept_words:
    sentence_scores = []
    for sentence in dataset:
        # Lowercase before tokenizing so tokens match the lowercased
        # TF-IDF vocabulary
        words = nltk.word_tokenize(sentence.lower())
        score = 0
        # Add the weight of every concept term that appears in the sentence
        for word in words:
            for term, weight in concept_words[key]:
                if word == term:
                    score += weight
        sentence_scores.append(score)
    svd_scores[key] = sentence_scores
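# svd_scores holds one score per sentence for each concept; a higher score
# means the sentence contains more of that concept's top-weighted terms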
# Map each concept to a human-readable topic label, then print every
# sentence's score under each concept
concept_map = {'Concept 0': 'Topic 1', 'Concept 1': 'Topic 2',
               'Concept 2': 'Topic 3', 'Concept 3': 'Topic 4'}
for key, scores in svd_scores.items():
    print(f'\n{key} {concept_map[key]}:')
    for i, score in enumerate(scores):
        print(f'{score}: {dataset[i]}')
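The scoring loop above only counts exact matches against each concept's top ten terms. A more direct alternative, sketched here under the assumption that the `vectorizer`, `svd`, and `dataset` above are still in scope, is to project the TF-IDF matrix into concept space with `svd.transform`, which uses every term's weight rather than just the top ten; an argmax then assigns each sentence to its strongest concept.

import numpy as np

# One row per sentence, one column per concept
doc_topic = svd.transform(X)

# Assign each sentence to its highest-scoring concept
for i, sentence in enumerate(dataset):
    best = int(np.argmax(doc_topic[i]))
    print(f'Concept {best}: {sentence}')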