하이브리드 버전: k-평균 군집화(K-Means) + 코사인 유사도
import pandas as pd
from gensim.models import Word2Vec
import re
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
def record_to_vector(model, row):
    """Convert one recipe record into an (ingredients, category) vector pair.

    Args:
        model: Object exposing ``wv`` (token -> vector lookup that supports
            ``in``) and ``vector_size``, e.g. a trained gensim ``Word2Vec``.
        row: Mapping with a comma-separated ingredients string under ``'재료'``
            and a category string under ``'카테고리'``.

    Returns:
        Tuple ``(ingredients_vector, category_vector)``. The ingredients
        vector is the mean of the embeddings of the ingredients found in the
        vocabulary. Either vector is all zeros (length ``model.vector_size``)
        when nothing is found in the vocabulary.
    """
    # Raw string: '\s' in a plain string literal is an invalid escape sequence.
    ingredients_list = re.split(r',\s*', row['재료'])
    ingredients_vector_list = [
        model.wv[ingredient]
        for ingredient in ingredients_list
        if ingredient in model.wv
    ]
    if ingredients_vector_list:
        ingredients_vector = np.mean(ingredients_vector_list, axis=0)
    else:
        ingredients_vector = np.zeros(model.vector_size)

    category_value = row['카테고리'].replace(" ", "_")
    # An out-of-vocabulary category used to raise KeyError here; fall back to
    # a zero vector, mirroring how unknown ingredients are handled above.
    if category_value in model.wv:
        category_vector = model.wv[category_value]
    else:
        category_vector = np.zeros(model.vector_size)
    return ingredients_vector, category_vector
# Toy dataset: dish name, comma-separated ingredients, and cuisine category.
new_data = {
    '음식명': ['짜장면', '짬뽕', '볶음밥', '된장찌개', '된장라면', '해물라면', '김치찌개', '떡볶이', '쌀국수', '피자', '스프링롤'],
    '재료': ['면, 소스', '해물, 면', '밥, 재료', '된장, 야채', '된장, 면', '해물, 면', '김치, 두부', '떡, 고추장', '면, 야채', '밀가루, 토마토 소스', '밀가루, 야채'],
    '카테고리': ['중식', '중식', '한식', '한식', '한식', '한식', '한식', '한식', '아시아', '양식', '아시아']
}
df = pd.DataFrame(new_data)

# One "sentence" per recipe: its ingredients followed by its category token.
# The category is normalized (spaces -> underscores) exactly as
# record_to_vector looks it up, so a category containing spaces is still
# found in the vocabulary. The regex is a raw string to avoid the invalid
# '\s' escape.
tokenized_data = [
    re.split(r',\s*', recipe) + [category.replace(" ", "_")]
    for recipe, category in zip(df['재료'], df['카테고리'])
]
model = Word2Vec(sentences=tokenized_data, vector_size=100, window=5, min_count=1, workers=4)

# Build one feature row per recipe: [mean ingredient vector | category vector].
vectors = [record_to_vector(model, row) for _, row in df.iterrows()]
ingredients_vectors, category_vectors = zip(*vectors)
ingredients_vectors = np.array(ingredients_vectors)
category_vectors = np.array(category_vectors)
combined_vectors = np.hstack([ingredients_vectors, category_vectors])

# Cluster the recipes and keep the centroids for similarity ranking.
kmeans = KMeans(n_clusters=3, random_state=42)
df['cluster'] = kmeans.fit_predict(combined_vectors)
cluster_centers = kmeans.cluster_centers_

# Rows are cluster centroids, columns are recipes.
cluster_similarity_matrix = cosine_similarity(cluster_centers, combined_vectors)
target_cluster_index = 0
n_recommendations = 5
# Recipe positions sorted by descending similarity to the target centroid.
cluster_similar_indices = np.argsort(cluster_similarity_matrix[target_cluster_index])[::-1][:n_recommendations]
print(f"Recommendation results (items closest to cluster {target_cluster_index}):")
for i, idx in enumerate(cluster_similar_indices):
    print(f"{i+1}. Recipe {idx}, Cosine Similarity: {cluster_similarity_matrix[target_cluster_index, idx]}")
# Project the combined recipe vectors to 2-D with t-SNE for plotting.
tsne = TSNE(n_components=2, random_state=42, perplexity=3)
embedded_vectors = tsne.fit_transform(combined_vectors)

# Draw two figures: first the clustered recipes alone, then the same scatter
# with the recommended recipes highlighted in red.
for highlight_recommendations in (False, True):
    fig, ax = plt.subplots()
    ax.scatter(embedded_vectors[:, 0], embedded_vectors[:, 1], c=df['cluster'], label='Ingredients')
    if highlight_recommendations:
        ax.scatter(
            embedded_vectors[cluster_similar_indices, 0],
            embedded_vectors[cluster_similar_indices, 1],
            color='red',
            label='Recommendations',
        )
    ax.set_xlabel('t-SNE Dimension 1')
    ax.set_ylabel('t-SNE Dimension 2')
    ax.legend()
    plt.show()

응용 -> 전체 데이터를 학습시켜 여러 개의 군집으로 분류해 둔 뒤, 테스트 데이터(찜 목록 내 레시피라 가정)가 들어오면 해당 데이터가 어느 군집에 속하는지 계산하고, 그 군집에 속한 레시피들을 추천해 주는 시스템
import pandas as pd
from gensim.models import Word2Vec
import re
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from google.colab import drive
def record_to_vector(model, row):
    """Convert one recipe record into an (ingredients, category) vector pair.

    Args:
        model: Object exposing ``wv`` (token -> vector lookup that supports
            ``in``) and ``vector_size``, e.g. a trained gensim ``Word2Vec``.
        row: Mapping with a comma-separated ingredients string under
            ``'RCP_PARTS_DTLS'`` and a category string under ``'RCP_PAT2'``.

    Returns:
        Tuple ``(ingredients_vector, category_vector)``. The ingredients
        vector is the mean of the embeddings of the ingredients found in the
        vocabulary. Either vector is all zeros (length ``model.vector_size``)
        when nothing is found in the vocabulary.
    """
    # Raw string: '\s' in a plain string literal is an invalid escape sequence.
    ingredients_list = re.split(r',\s*', row['RCP_PARTS_DTLS'])
    ingredients_vector_list = [
        model.wv[ingredient]
        for ingredient in ingredients_list
        if ingredient in model.wv
    ]
    if ingredients_vector_list:
        ingredients_vector = np.mean(ingredients_vector_list, axis=0)
    else:
        ingredients_vector = np.zeros(model.vector_size)

    # Categories are looked up with spaces replaced by underscores; unknown
    # categories map to a zero vector instead of raising.
    category_value = row['RCP_PAT2'].replace(" ", "_")
    if category_value in model.wv:
        category_vector = model.wv[category_value]
    else:
        category_vector = np.zeros(model.vector_size)
    return ingredients_vector, category_vector
drive.mount('/content/drive')

# NOTE(review): csv_link is not defined in the visible code — presumably set
# elsewhere (e.g. another notebook cell); confirm before running.
df_train = pd.read_csv(csv_link)

# Keep only rows whose ingredients and category are real strings (drops NaN).
# .copy() makes the filtered frame independent, so adding the 'cluster'
# column below does not trigger pandas' SettingWithCopyWarning.
has_text_fields_train = (
    df_train['RCP_PARTS_DTLS'].apply(lambda x: isinstance(x, str))
    & df_train['RCP_PAT2'].apply(lambda x: isinstance(x, str))
)
df_train = df_train[has_text_fields_train].copy()

# One "sentence" per recipe: its ingredients followed by its category token.
# The category is normalized (spaces -> underscores) exactly as
# record_to_vector looks it up; otherwise a category containing spaces would
# never be found in the vocabulary and would always map to a zero vector.
tokenized_train_data = [
    re.split(r',\s*', recipe) + [category.replace(" ", "_")]
    for recipe, category in zip(df_train['RCP_PARTS_DTLS'], df_train['RCP_PAT2'])
]
model = Word2Vec(sentences=tokenized_train_data, vector_size=100, window=5, min_count=1, workers=4)

# Build one feature row per recipe: [mean ingredient vector | category vector].
vectors_train = [record_to_vector(model, row) for _, row in df_train.iterrows()]
ingredients_vectors_train, category_vectors_train = zip(*vectors_train)
ingredients_vectors_train = np.array(ingredients_vectors_train)
category_vectors_train = np.array(category_vectors_train)
combined_vectors_train = np.hstack([ingredients_vectors_train, category_vectors_train])

# KMeans cannot fit more clusters than there are samples, so cap the count
# for small datasets; with 60 or more recipes this is the original 60.
n_clusters_train = min(60, len(combined_vectors_train))
kmeans = KMeans(n_clusters=n_clusters_train, random_state=42)
df_train['cluster'] = kmeans.fit_predict(combined_vectors_train)
cluster_centers_train = kmeans.cluster_centers_
# NOTE(review): csv_link_test is not defined in the visible code — presumably
# set elsewhere (e.g. another notebook cell); confirm before running.
df_test = pd.read_csv(csv_link_test)

# Keep only rows whose ingredients and category are real strings (drops NaN).
has_text_fields_test = (
    df_test['RCP_PARTS_DTLS'].apply(lambda x: isinstance(x, str))
    & df_test['RCP_PAT2'].apply(lambda x: isinstance(x, str))
)
df_test = df_test[has_text_fields_test]

vectors_test = [record_to_vector(model, row) for _, row in df_test.iterrows()]
if not vectors_test:
    # zip(*[]) below would otherwise fail with an opaque unpacking error.
    raise ValueError("No usable test recipes: every row lacks a string RCP_PARTS_DTLS or RCP_PAT2")
ingredients_vectors_test, category_vectors_test = zip(*vectors_test)
ingredients_vectors_test = np.array(ingredients_vectors_test)
category_vectors_test = np.array(category_vectors_test)
combined_vectors_test = np.hstack([ingredients_vectors_test, category_vectors_test])

# Rows are trained cluster centroids, columns are test recipes.
cluster_similarity_matrix_test = cosine_similarity(cluster_centers_train, combined_vectors_test)
# For each test recipe, the centroid with the highest cosine similarity.
target_cluster_indices_test = np.argmax(cluster_similarity_matrix_test, axis=0)
for test_index, cluster_index in enumerate(target_cluster_indices_test):
    print(f"Test Recipe {test_index + 1} belongs to Cluster {cluster_index}:")
    cluster_recipes = df_train[df_train['cluster'] == cluster_index]['RCP_NM']
    print(cluster_recipes)
    print("\n")

n_recommendations_test = 5

# Summarize the whole test set as a single vector (mean of its recipe vectors)
# and pick the one cluster whose centroid is most similar to it. The original
# code read `target_cluster_index_test` without ever defining it (NameError).
mean_vector_test = combined_vectors_test.mean(axis=0, keepdims=True)
target_cluster_index_test = int(
    np.argmax(cosine_similarity(cluster_centers_train, mean_vector_test)[:, 0])
)

# Positional rows of df_train / combined_vectors_train in that cluster.
# Positions (not index labels) are used because df_train was filtered and its
# index labels no longer line up with the rows of combined_vectors_train.
member_positions_test = np.flatnonzero(
    df_train['cluster'].to_numpy() == target_cluster_index_test
)
# Rank the cluster's training recipes by similarity to the mean test vector.
member_similarities_test = cosine_similarity(
    mean_vector_test, combined_vectors_train[member_positions_test]
)[0]
ranking_test = np.argsort(member_similarities_test)[::-1][:n_recommendations_test]
cluster_similar_indices_test = member_positions_test[ranking_test]

print(f"Top {n_recommendations_test} Recipe recommendations (items closest to cluster {target_cluster_index_test}):")
for i, (idx, similarity) in enumerate(zip(cluster_similar_indices_test, member_similarities_test[ranking_test])):
    recipe_name = df_train.iloc[idx]['RCP_NM']
    print(f"{i+1}. Recipe {recipe_name}, Cosine Similarity: {similarity}")


이런 식! 여기서 이제 테스트 데이터의 레시피들을 종합해서 가장 가까운 군집 하나를 알아내고, 그 군집의 레시피 데이터들만 출력하도록 하면 됨
테스트 데이터 벡터들의 평균을 구하고, 그 평균 벡터가 속하는 군집 한 개를 뽑으면 되지 않을까 싶음
+ 코사인 유사도를 추가로 사용해서 그 군집 내 데이터 중에서도 최대한 가까운 것들로 추천
->> 에러에러에러에러.........