import sys

import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize
from sklearn.cluster import KMeans
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import silhouette_score
# Load the articles from a text file, one article per line.
# Blank lines are skipped so they do not become empty documents, and the
# trailing newline is stripped so it does not end up in the result table.
with open('artykuly.txt', 'r', encoding='utf-8') as file:
    data = [line.strip() for line in file if line.strip()]

# silhouette_score() requires 2 <= n_clusters <= n_samples - 1, so at least
# three articles are needed before any candidate k can be evaluated.
if len(data) < 3:
    raise SystemExit('artykuly.txt must contain at least 3 non-empty lines to cluster.')

# Text preprocessing.
# The stopwords corpus reader raises OSError when the requested language file
# is absent (NOTE(review): the stock NLTK corpus appears to ship no Polish
# list - confirm). In that case continue without stop-word filtering. A corpus
# that has not been downloaded at all raises LookupError, which is
# deliberately left to propagate.
try:
    stop_words = set(stopwords.words('polish'))
except OSError:
    print('Warning: no Polish stop-word list in the NLTK corpus; '
          'stop words will not be removed.', file=sys.stderr)
    stop_words = set()

# SnowballStemmer raises ValueError for languages it does not implement, and
# Polish is not among them; fall back to using the tokens unstemmed.
try:
    stemmer = SnowballStemmer('polish')
except ValueError:
    print('Warning: SnowballStemmer does not support Polish; '
          'tokens will not be stemmed.', file=sys.stderr)
    stemmer = None

processed_data = []
for article in data:
    # Tokenize the lower-cased article with the Polish Punkt model.
    tokens = word_tokenize(article.lower(), language='polish')
    # Keep alphanumeric tokens that are not stop words; stem when possible.
    filtered_tokens = [
        stemmer.stem(token) if stemmer is not None else token
        for token in tokens
        if token.isalnum() and token not in stop_words
    ]
    processed_data.append(' '.join(filtered_tokens))

# Vectorize the text as TF-IDF over at most 1000 vocabulary terms.
vectorizer = TfidfVectorizer(max_features=1000)
X = vectorizer.fit_transform(processed_data)

# Topic modelling with LDA (5 topics).
# NOTE(review): LDA models word counts; feeding it TF-IDF weights runs but is
# unusual - consider CountVectorizer. Left unchanged to preserve results.
lda = LatentDirichletAllocation(n_components=5, random_state=42)

# Despite the name, `clusters` holds the per-document topic distribution
# (one row per article, one column per topic) that k-means clusters below.
clusters = lda.fit_transform(X)

# Choose the number of clusters by the highest silhouette score (this is not
# the elbow method). The upper bound is capped at n_samples - 1, the largest
# k that silhouette_score() accepts.
optimal_k = None
best_score = None
for k in range(2, min(10, len(data))):
    candidate = KMeans(n_clusters=k, random_state=42)
    labels = candidate.fit_predict(clusters)
    # Identical documents can collapse into a single cluster, for which the
    # silhouette score is undefined; skip such candidates.
    if len(set(labels)) < 2:
        continue
    score = silhouette_score(clusters, labels)
    # Strict '>' keeps the smallest k on ties, like list.index(max(...)).
    if best_score is None or score > best_score:
        best_score, optimal_k, kmeans = score, k, candidate

if optimal_k is None:
    raise SystemExit('Could not split the articles into at least 2 distinct clusters.')

# Final clustering: reuse the best model fitted above. The fixed random_state
# makes a refit redundant, since it would produce the same labels.
cluster_labels = kmeans.labels_

# Show the results: each article with its assigned cluster.
result_df = pd.DataFrame({'Article': data, 'Cluster': cluster_labels})
print(result_df)