import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF
from sklearn.datasets import fetch_20newsgroups
# --- 1. Load and Prepare Data ---
def load_and_prepare_data(n_samples=500):
    print("Loading 20 newsgroups dataset...")
    # Use a small four-category subset for a quicker demonstration
categories = [
'alt.atheism',
'talk.religion.misc',
'comp.graphics',
'sci.space',
]
dataset = fetch_20newsgroups(subset='all', categories=categories,
shuffle=True, random_state=42,
remove=('headers', 'footers', 'quotes'))
data_samples = dataset.data[:n_samples]
print(f"Loaded {len(data_samples)} samples.")
return data_samples
# --- 2. Vectorize Text Data ---
def vectorize_text(data_samples, max_df=0.95, min_df=2, n_features=1000, use_tfidf=False):
    print("Vectorizing text data...")
    if use_tfidf:
        vectorizer = TfidfVectorizer(max_df=max_df, min_df=min_df,
                                     max_features=n_features, stop_words='english')
    else:
        vectorizer = CountVectorizer(max_df=max_df, min_df=min_df,
                                     max_features=n_features, stop_words='english')
    dtm = vectorizer.fit_transform(data_samples)
    feature_names = vectorizer.get_feature_names_out()
    # Return the fitted vectorizer as well, so new documents can later be
    # transformed with the same vocabulary (and, for TF-IDF, the learned IDF weights).
    return dtm, feature_names, vectorizer
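# Example usage (sketch, assuming the three-value return above): the matrix has
# shape (n_documents, n_kept_terms) and feature_names is the kept vocabulary.
# dtm, names, vec = vectorize_text(["space shuttle launch", "graphics card"], min_df=1)
# print(dtm.shape, names[:5])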
# --- 3. Display Top Words for Topics ---
def display_topics(model, feature_names, n_top_words):
for topic_idx, topic in enumerate(model.components_):
message = f"Topic #{topic_idx}: "
message += " ".join([feature_names[i]
for i in topic.argsort()[:-n_top_words - 1:-1]])
print(message)
print()
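# A small helper (hypothetical, not part of the original script): report the
# dominant topic for the first few documents. model.transform() works for both
# LDA (topic probabilities) and NMF (non-negative topic weights).
def show_dominant_topics(model, dtm, n_docs=5):
    doc_topic = model.transform(dtm[:n_docs])
    for doc_idx, dist in enumerate(doc_topic):
        print(f"Doc #{doc_idx}: dominant topic = {int(np.argmax(dist))}")
    print()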
# --- Main Execution ---
if __name__ == '__main__': # To prevent execution when imported
n_samples = 500 # Number of documents to use
n_features = 1000 # Number of words (features) to keep
n_components = 5 # Number of topics to find
n_top_words = 10 # Number of top words to display per topic
# Load data
data_samples = load_and_prepare_data(n_samples=n_samples)
    # Document-term matrix for LDA: LDA models raw term counts, so use CountVectorizer
    dtm_cv, feature_names_cv, vectorizer_cv = vectorize_text(data_samples, n_features=n_features, use_tfidf=False)
    # Document-term matrix for NMF: TF-IDF weighting usually yields cleaner NMF topics
    dtm_tfidf, feature_names_tfidf, vectorizer_tfidf = vectorize_text(data_samples, n_features=n_features, use_tfidf=True)
# --- Latent Dirichlet Allocation (LDA) ---
print("Fitting LDA model with CountVectorizer features...")
lda = LatentDirichletAllocation(n_components=n_components, max_iter=10, # max_iter for speed
learning_method='online',
learning_offset=50.,
random_state=42)
lda.fit(dtm_cv)
print("\nTopics found by LDA:")
display_topics(lda, feature_names_cv, n_top_words)
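    # Rough diagnostics (a sketch, not in the original script): perplexity on the
    # training matrix (lower is better; only comparable across identical
    # preprocessing) plus the dominant topic for a few sample documents.
    print(f"LDA perplexity on training data: {lda.perplexity(dtm_cv):.1f}")
    show_dominant_topics(lda, dtm_cv)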
# --- Non-Negative Matrix Factorization (NMF) ---
print("Fitting NMF model with TF-IDF features...")
# NMF often benefits from TF-IDF weighting
    nmf = NMF(n_components=n_components, random_state=42,
              alpha_W=0.00005, alpha_H=0.00005,  # mild regularization on both factor matrices
              l1_ratio=1,  # l1_ratio=1 makes the penalty pure L1, encouraging sparse topics
              max_iter=300)  # NMF often needs more iterations than the default to converge
nmf.fit(dtm_tfidf)
print("\nTopics found by NMF:")
display_topics(nmf, feature_names_tfidf, n_top_words)
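    # Rough diagnostic (a sketch, not in the original script): fit() stores the
    # Frobenius reconstruction error; lower means a closer factorization.
    print(f"NMF reconstruction error: {nmf.reconstruction_err_:.4f}")
    show_dominant_topics(nmf, dtm_tfidf)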
    # To get the topic distribution for a new document, reuse the *fitted*
    # vectorizers; a freshly constructed TfidfVectorizer would raise
    # NotFittedError because it has no learned IDF weights.
    # new_doc_text = ["New document about space exploration and astronaut missions."]
    # topic_dist_lda = lda.transform(vectorizer_cv.transform(new_doc_text))
    # print("\nLDA Topic distribution for new doc:", topic_dist_lda)
    # topic_dist_nmf = nmf.transform(vectorizer_tfidf.transform(new_doc_text))
    # # NMF weights are not probabilities; normalize each row to get proportions.
    # topic_dist_nmf = topic_dist_nmf / topic_dist_nmf.sum(axis=1, keepdims=True)
    # print("NMF Topic distribution for new doc:", topic_dist_nmf)