카테고리 없음

밥디 진행상황2

토끼곰 2024. 2. 7. 18:43

코랩

 

기존버전 (kmeans+ Randomforest + XGboost)

 

 

 

Extra trees 까지 추가한 버전 

 

import pandas as pd
from gensim.models import Word2Vec
import re
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from google.colab import drive

def record_to_vector(model, row):
    """Convert one recipe row into an (ingredients, category) vector pair.

    Parameters
    ----------
    model : gensim Word2Vec model; only ``model.wv`` lookups and
        ``model.vector_size`` are used, so any object with those works.
    row : mapping with 'RCP_PARTS_DTLS' (comma-separated ingredients
        string) and 'RCP_PAT2' (category string).

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Mean of the in-vocabulary ingredient vectors (zeros when none
        are known), and the category vector (zeros when unknown).
    """
    # Raw string fixes the invalid '\s' escape in the original literal
    # (DeprecationWarning historically, SyntaxWarning since Python 3.12);
    # the matched pattern itself is unchanged.
    ingredients_list = re.split(r',\s*', row['RCP_PARTS_DTLS'])

    # Keep only ingredients that exist in the Word2Vec vocabulary.
    ingredients_vector_list = [model.wv[ingredient] for ingredient in ingredients_list if ingredient in model.wv]

    if ingredients_vector_list:
        ingredients_vector = np.mean(ingredients_vector_list, axis=0)
    else:
        # No known ingredients: fall back to a zero vector of matching size.
        ingredients_vector = np.zeros(model.vector_size)

    # NOTE(review): training tokenization (see the script below) feeds the
    # raw category string to Word2Vec, so for categories containing spaces
    # this underscore-substituted token will never be found in the vocab
    # and the zero fallback is taken — confirm which form is intended.
    category_value = row['RCP_PAT2'].replace(" ", "_")

    if category_value in model.wv:
        category_vector = model.wv[category_value]
    else:
        category_vector = np.zeros(model.vector_size)

    return ingredients_vector, category_vector

# Load data (Colab): mount Google Drive so the CSV paths can live on Drive.
drive.mount('/content/drive')

# NOTE(review): `csv_link` is not defined anywhere in this file — presumably
# assigned in an earlier notebook cell; confirm before running standalone.
df_train = pd.read_csv(csv_link)

# Preprocessing: keep only rows where both the ingredients text
# ('RCP_PARTS_DTLS') and the category label ('RCP_PAT2') are strings
# (drops NaN / non-string rows).
df_train = df_train[df_train['RCP_PARTS_DTLS'].apply(lambda x: isinstance(x, str)) & df_train['RCP_PAT2'].apply(lambda x: isinstance(x, str))]

# Tokenization and Word2Vec training: each "sentence" is the comma-split
# ingredient list plus the (unmodified) category string as one extra token.
tokenized_train_data = [re.split(',\s*', recipe) + [category] for recipe, category in zip(df_train['RCP_PARTS_DTLS'], df_train['RCP_PAT2'])]
model = Word2Vec(sentences=tokenized_train_data, vector_size=100, window=5, min_count=1, workers=4)

# Convert each row into an (ingredient-mean vector, category vector) pair.
vectors_train = [record_to_vector(model, row) for _, row in df_train.iterrows()]
ingredients_vectors_train, category_vectors_train = zip(*vectors_train)
ingredients_vectors_train = np.array(ingredients_vectors_train)
category_vectors_train = np.array(category_vectors_train)

# KMeans clustering on the concatenated (ingredients | category) vectors;
# the resulting cluster id is used as the supervised target below.
combined_vectors_train = np.hstack([ingredients_vectors_train, category_vectors_train])
kmeans = KMeans(n_clusters=60, random_state=42)
df_train['cluster'] = kmeans.fit_predict(combined_vectors_train)
cluster_centers_train = kmeans.cluster_centers_

# Train/test split (labels are the KMeans cluster assignments).
X_train, X_test, y_train, y_test = train_test_split(combined_vectors_train, df_train['cluster'], test_size=0.2, random_state=42)

# Train Random Forest and "XGBoost" models.
# NOTE(review): despite the name, xgb_model is sklearn's
# GradientBoostingClassifier, not the xgboost library.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

xgb_model = GradientBoostingClassifier(n_estimators=100, random_state=42)
xgb_model.fit(X_train, y_train)

# Load test data.
# NOTE(review): `csv_link_test` is likewise undefined in this file.
df_test = pd.read_csv(csv_link_test)

# Test-data preprocessing (same string filter as the training data).
df_test = df_test[df_test['RCP_PARTS_DTLS'].apply(lambda x: isinstance(x, str)) & df_test['RCP_PAT2'].apply(lambda x: isinstance(x, str))]

# Vectorize the test data with the already-trained Word2Vec model.
# NOTE(review): tokenized_test_data is computed but never used afterwards.
tokenized_test_data = [re.split(',\s*', recipe) + [category] for recipe, category in zip(df_test['RCP_PARTS_DTLS'], df_test['RCP_PAT2'])]
vectors_test = [record_to_vector(model, row) for _, row in df_test.iterrows()]
ingredients_vectors_test, category_vectors_test = zip(*vectors_test)
ingredients_vectors_test = np.array(ingredients_vectors_test)
category_vectors_test = np.array(category_vectors_test)

# Re-assemble the two raw test columns into one frame.
combined_test_data = pd.concat([df_test['RCP_PARTS_DTLS'], df_test['RCP_PAT2']], axis=1)

# Vectors for the combined test data.
# NOTE(review): this recomputes exactly the vectors built just above
# (vectors_test) — the two passes are redundant.
vectors_combined_test = [record_to_vector(model, row) for _, row in combined_test_data.iterrows()]
ingredients_vectors_combined_test, category_vectors_combined_test = zip(*vectors_combined_test)
ingredients_vectors_combined_test = np.array(ingredients_vectors_combined_test)
category_vectors_combined_test = np.array(category_vectors_combined_test)

# Predict a cluster id for each test vector with both models.
combined_vectors_combined_test = np.hstack([ingredients_vectors_combined_test, category_vectors_combined_test])
rf_predictions_combined_test = rf_model.predict(combined_vectors_combined_test)
xgb_predictions_combined_test = xgb_model.predict(combined_vectors_combined_test)

# Combine the predictions by simple averaging.
# NOTE(review): averaging categorical cluster ids treats class labels as
# ordinal numbers — the mean of two cluster ids is generally not a
# meaningful cluster id.
hybrid_predictions_combined_test = (rf_predictions_combined_test + xgb_predictions_combined_test) / 2

# For each test vector, find the most cosine-similar training cluster center.
cluster_similarity_matrix_combined_test = cosine_similarity(cluster_centers_train, combined_vectors_combined_test)
target_cluster_indices_combined_test = np.argmax(cluster_similarity_matrix_combined_test, axis=0)

# Blend the hybrid model predictions with the nearest-cluster indices.
final_predictions_combined_test = (hybrid_predictions_combined_test + target_cluster_indices_combined_test) / 2

# Print the top-N recommendations: the row positions with the largest
# blended score (score magnitude, not similarity, drives this ranking).
n_recommendations_combined_test = 5
cluster_similar_indices_combined_test = np.argsort(final_predictions_combined_test)[::-1][:n_recommendations_combined_test]

print(f"Top {n_recommendations_combined_test} Recipe recommendations for the combined test data:")
for i, idx in enumerate(cluster_similar_indices_combined_test):
    print(f"{i+1}. Recipe {idx}, Prediction: {final_predictions_combined_test[idx]}")

 

 

 

임시최종 버전 (kmeans + Randomforest + XGboost + Extra trees / 모델 병합 방식: 평균화)

import pandas as pd
from gensim.models import Word2Vec
import re
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from IPython.display import Image
from google.colab import drive

def record_to_vector(model, row):
    """Convert one recipe row into an (ingredients, category) vector pair.

    Parameters
    ----------
    model : gensim Word2Vec model; only ``model.wv`` lookups and
        ``model.vector_size`` are used, so any object with those works.
    row : mapping with 'RCP_PARTS_DTLS' (comma-separated ingredients
        string) and 'RCP_PAT2' (category string).

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Mean of the in-vocabulary ingredient vectors (zeros when none
        are known), and the category vector (zeros when unknown).
    """
    # Raw string fixes the invalid '\s' escape in the original literal
    # (DeprecationWarning historically, SyntaxWarning since Python 3.12);
    # the matched pattern itself is unchanged.
    ingredients_list = re.split(r',\s*', row['RCP_PARTS_DTLS'])

    # Keep only ingredients that exist in the Word2Vec vocabulary.
    ingredients_vector_list = [model.wv[ingredient] for ingredient in ingredients_list if ingredient in model.wv]

    if ingredients_vector_list:
        ingredients_vector = np.mean(ingredients_vector_list, axis=0)
    else:
        # No known ingredients: fall back to a zero vector of matching size.
        ingredients_vector = np.zeros(model.vector_size)

    # NOTE(review): training tokenization (see the script below) feeds the
    # raw category string to Word2Vec, so for categories containing spaces
    # this underscore-substituted token will never be found in the vocab
    # and the zero fallback is taken — confirm which form is intended.
    category_value = row['RCP_PAT2'].replace(" ", "_")

    if category_value in model.wv:
        category_vector = model.wv[category_value]
    else:
        category_vector = np.zeros(model.vector_size)

    return ingredients_vector, category_vector

# Load data (Colab): mount Google Drive so the CSV paths can live on Drive.
drive.mount('/content/drive')

# NOTE(review): `csv_link` is not defined anywhere in this file — presumably
# assigned in an earlier notebook cell; confirm before running standalone.
df_train = pd.read_csv(csv_link)

# Preprocessing: keep only rows where both the ingredients text
# ('RCP_PARTS_DTLS') and the category label ('RCP_PAT2') are strings
# (drops NaN / non-string rows).
df_train = df_train[df_train['RCP_PARTS_DTLS'].apply(lambda x: isinstance(x, str)) & df_train['RCP_PAT2'].apply(lambda x: isinstance(x, str))]

# Tokenization and Word2Vec training: each "sentence" is the comma-split
# ingredient list plus the (unmodified) category string as one extra token.
tokenized_train_data = [re.split(',\s*', recipe) + [category] for recipe, category in zip(df_train['RCP_PARTS_DTLS'], df_train['RCP_PAT2'])]
model = Word2Vec(sentences=tokenized_train_data, vector_size=100, window=5, min_count=1, workers=4)

# Convert each row into an (ingredient-mean vector, category vector) pair.
vectors_train = [record_to_vector(model, row) for _, row in df_train.iterrows()]
ingredients_vectors_train, category_vectors_train = zip(*vectors_train)
ingredients_vectors_train = np.array(ingredients_vectors_train)
category_vectors_train = np.array(category_vectors_train)

# KMeans clustering on the concatenated (ingredients | category) vectors;
# the resulting cluster id is used as the supervised target below.
combined_vectors_train = np.hstack([ingredients_vectors_train, category_vectors_train])
kmeans = KMeans(n_clusters=60, random_state=42)
df_train['cluster'] = kmeans.fit_predict(combined_vectors_train)
cluster_centers_train = kmeans.cluster_centers_

# Train/test split (labels are the KMeans cluster assignments).
X_train, X_test, y_train, y_test = train_test_split(combined_vectors_train, df_train['cluster'], test_size=0.2, random_state=42)

# Train Random Forest, Extra Trees, and "XGBoost" models.
# NOTE(review): despite the name, xgb_model is sklearn's
# GradientBoostingClassifier, not the xgboost library.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

et_model = ExtraTreesClassifier(n_estimators=100, random_state=42)
et_model.fit(X_train, y_train)

xgb_model = GradientBoostingClassifier(n_estimators=100, random_state=42)
xgb_model.fit(X_train, y_train)

# Load test data.
# NOTE(review): `csv_link_test` is likewise undefined in this file.
df_test = pd.read_csv(csv_link_test)

# Test-data preprocessing (same string filter as the training data).
df_test = df_test[df_test['RCP_PARTS_DTLS'].apply(lambda x: isinstance(x, str)) & df_test['RCP_PAT2'].apply(lambda x: isinstance(x, str))]

# Vectorize the test data with the already-trained Word2Vec model.
# NOTE(review): tokenized_test_data is computed but never used afterwards.
tokenized_test_data = [re.split(',\s*', recipe) + [category] for recipe, category in zip(df_test['RCP_PARTS_DTLS'], df_test['RCP_PAT2'])]
vectors_test = [record_to_vector(model, row) for _, row in df_test.iterrows()]
ingredients_vectors_test, category_vectors_test = zip(*vectors_test)
ingredients_vectors_test = np.array(ingredients_vectors_test)
category_vectors_test = np.array(category_vectors_test)

# Re-assemble the two raw test columns into one frame.
combined_test_data = pd.concat([df_test['RCP_PARTS_DTLS'], df_test['RCP_PAT2']], axis=1)

# Vectors for the combined test data.
# NOTE(review): this recomputes exactly the vectors built just above
# (vectors_test) — the two passes are redundant.
vectors_combined_test = [record_to_vector(model, row) for _, row in combined_test_data.iterrows()]
ingredients_vectors_combined_test, category_vectors_combined_test = zip(*vectors_combined_test)
ingredients_vectors_combined_test = np.array(ingredients_vectors_combined_test)
category_vectors_combined_test = np.array(category_vectors_combined_test)

# Predict a cluster id for each test vector with all three models.
combined_vectors_combined_test = np.hstack([ingredients_vectors_combined_test, category_vectors_combined_test])
rf_predictions_combined_test = rf_model.predict(combined_vectors_combined_test)
et_predictions_combined_test = et_model.predict(combined_vectors_combined_test)
xgb_predictions_combined_test = xgb_model.predict(combined_vectors_combined_test)

# Combine the predictions by simple averaging.
# NOTE(review): averaging categorical cluster ids treats class labels as
# ordinal numbers — the mean of three cluster ids is generally not a
# meaningful cluster id.
hybrid_predictions_combined_test = (rf_predictions_combined_test + et_predictions_combined_test + xgb_predictions_combined_test) / 3

# For each test vector, find the most cosine-similar training cluster center.
cluster_similarity_matrix_combined_test = cosine_similarity(cluster_centers_train, combined_vectors_combined_test)
target_cluster_indices_combined_test = np.argmax(cluster_similarity_matrix_combined_test, axis=0)

# Blend the hybrid model predictions with the nearest-cluster indices.
final_predictions_combined_test = (hybrid_predictions_combined_test + target_cluster_indices_combined_test) / 2

# Duplicate the display columns under friendlier names for the output loop.
df_test['Recipe_Name'] = df_test['RCP_NM']
df_test['Photo_Link'] = df_test['ATT_FILE_NO_MAIN']

# Print the top-N recommendations: the row positions with the largest
# blended score (score magnitude, not similarity, drives this ranking).
n_recommendations_combined_test = 5
cluster_similar_indices_combined_test = np.argsort(final_predictions_combined_test)[::-1][:n_recommendations_combined_test]

print(f"Top {n_recommendations_combined_test} Recipe recommendations for the combined test data:")
for i, idx in enumerate(cluster_similar_indices_combined_test):
    # NOTE(review): idx is a positional index from argsort, but .at uses the
    # DataFrame's index labels — after the row filter above these can differ,
    # which may raise KeyError or fetch the wrong row; consider .iloc or
    # reset_index() after filtering.
    recipe_name = df_test.at[idx, 'Recipe_Name']
    photo_link = df_test.at[idx, 'Photo_Link']
    prediction = final_predictions_combined_test[idx]
   
    print(f"{i+1}. Recipe Name: {recipe_name}, Prediction: {prediction}")
    print(f"   Photo Link: {photo_link}")
    # `display` is not imported here — presumably relies on the builtin that
    # IPython/Colab injects into the namespace; verify outside a notebook.
    display(Image(url=photo_link))  # show the recipe photo inline