Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[황지원] week9_2024_GDG_ML입문스터디 #9

Open
wants to merge 18 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions week3/[MLNovice]황지원_week3-1.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"provenance":[],"authorship_tag":"ABX9TyN+NosSh6cOGlVX9GZGUYoV"},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"}},"cells":[{"cell_type":"code","execution_count":null,"metadata":{"id":"mUPgayDTIuMU"},"outputs":[],"source":["# 럭키백의 확률\n","\n","# 데이터 준비하기\n","import pandas as pd\n","\n","fish = pd.read_csv('https://bit.ly/fish_csv_data')\n","fish.head() # 테이블로 출력해\n","\n","# input 데이터\n","fish_input = fish[['Weight','Length','Diagonal','Height','Width']].to_numpy()\n","# target 데이터\n","fish_target = fish['Species'].to_numpy()"]},{"cell_type":"code","source":["# 데이터 전처리\n","# train set, test set 나누기\n","from sklearn.model_selection import train_test_split\n","\n","train_input, test_input, train_target, test_target = train_test_split(\n"," fish_input, fish_target, random_state=42)\n","\n","# data scaling\n","from sklearn.preprocessing import StandardScaler\n","\n","ss = StandardScaler()\n","ss.fit(train_input)\n","train_scaled = ss.transform(train_input)\n","test_scaled = ss.transform(test_input)"],"metadata":{"id":"vUaGxPeON4Rb"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["# K-최근접 이웃의 다중분류 확률 예측\n","from sklearn.neighbors import KNeighborsClassifier\n","\n","kn = KNeighborsClassifier(n_neighbors=3) # 기본값은 5, 이번엔 3으로 설정\n","kn.fit(train_scaled, train_target)\n","\n","print(kn.classes_)\n","# class가 어떤식으로 나눠졌지 확인\n","\n","import numpy as np\n","\n","# 확률 출력 (predict_proba 메소드 사용)\n","proba = kn.predict_proba(test_scaled[:5])\n","print(np.round(proba, decimals=4))\n","\n","distances, indexes = kn.kneighbors(test_scaled[3:4])\n","print(train_target[indexes])"],"metadata":{"id":"5ZCyyrihN7sA"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["# 시그모이드 함수 만들기\n","import numpy as np\n","import matplotlib.pyplot as plt\n","\n","z = np.arange(-5, 5, 0.1)\n","phi = 1 / (1 + np.exp(-z))\n","\n","plt.plot(z, 
phi)\n","plt.xlabel('z')\n","plt.ylabel('phi')\n","plt.show()"],"metadata":{"id":"taWW1UNoOClC"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["# 로지스틱 회귀로 이진 분류\n","bream_smelt_indexes = (train_target == 'Bream') | (train_target == 'Smelt') # 불린 인덱싱\n","train_bream_smelt = train_scaled[bream_smelt_indexes]\n","target_bream_smelt = train_target[bream_smelt_indexes]\n","\n","from sklearn.linear_model import LogisticRegression\n","\n","# 객체 만들기\n","lr = LogisticRegression()\n","lr.fit(train_bream_smelt, target_bream_smelt)\n","\n","# 샘플 5개 (도미, 빙어)\n","print(lr.predict(train_bream_smelt[:5]))\n","# 샘플에 대한 확률\n","print(lr.predict_proba(train_bream_smelt[:5]))\n","\n","# 가중치\n","print(lr.coef_, lr.intercept_)\n","\n","# z값 계산\n","decisions = lr.decision_function(train_bream_smelt[:5])\n","print(decisions)\n","\n","# scipy에서 시그모이드함수 불러오기\n","from scipy.special import expit\n","print(expit(decisions))"],"metadata":{"id":"tWN1R-diOD7t"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["# 로지스틱 회귀로 다중 분류\n","lr = LogisticRegression(C=20, max_iter=1000)\n","lr.fit(train_scaled, train_target)\n","\n","# 정확도 예측\n","print(lr.score(train_scaled, train_target))\n","print(lr.score(test_scaled, test_target))\n","\n","# predict_proba 메소드로 확률 출력\n","proba = lr.predict_proba(test_scaled[:5])\n","print(np.round(proba, decimals=3))\n","\n","decision = lr.decision_function(test_scaled[:5])\n","print(np.round(decision, decimals=2))\n","\n","# 소프트맥스 함수\n","from scipy.special import softmax\n","\n","proba = softmax(decision, axis=1)\n","print(np.round(proba, decimals=3))"],"metadata":{"id":"w7q7UdRFOORO"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":[],"metadata":{"id":"FRzXkTPzPqlU"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":[],"metadata":{"id":"ftJkVWXTPqtU"},"execution_count":null,"outputs":[]}]}
1 change: 1 addition & 0 deletions week3/[MLNovice]황지원_week3-2.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"provenance":[],"authorship_tag":"ABX9TyPp4SoE1DvNF6xZT3xKSAFg"},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"}},"cells":[{"cell_type":"code","execution_count":null,"metadata":{"id":"k-tsR6BKTlHO"},"outputs":[],"source":["#SGD Classifier (확률적 경사 하강법 분류기)"]},{"cell_type":"code","source":["# 데이터 전처리\n","import pandas as pd\n","\n","fish = pd.read_csv('https://bit.ly/fish_csv_data')\n","\n","fish_input = fish[['Weight','Length','Diagonal','Height','Width']].to_numpy()\n","fish_target = fish['Species'].to_numpy()\n","\n","from sklearn.model_selection import train_test_split\n","\n","train_input, test_input, train_target, test_target = train_test_split(\n"," fish_input, fish_target, random_state=42)\n","\n","from sklearn.preprocessing import StandardScaler\n","\n","ss = StandardScaler()\n","ss.fit(train_input)\n","train_scaled = ss.transform(train_input)\n","test_scaled = ss.transform(test_input)"],"metadata":{"id":"bHmNSvnpTmxX"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["# 사이킷런에서 제공하는 SGD 모델\n","from sklearn.linear_model import SGDClassifier\n","\n","# 로지스틱 손실함수 지정\n","sc = SGDClassifier(loss='log_loss', max_iter=10, random_state=42)\n","sc.fit(train_scaled, train_target)\n","\n","# 정확도 출력\n","print(sc.score(train_scaled, train_target))\n","print(sc.score(test_scaled, test_target))\n","\n","# 이전에 훈련한 걸 다시 사용할지\n","sc.partial_fit(train_scaled, train_target)\n","\n","print(sc.score(train_scaled, train_target))\n","print(sc.score(test_scaled, test_target))"],"metadata":{"id":"HCBeluU2Tm2i"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["# 에포크와 과대/과소적합\n","\n","import numpy as np\n","\n","sc = SGDClassifier(loss='log_loss', random_state=42)\n","\n","train_score = []\n","test_score = []\n","\n","# partial_fit 메소드로 훈련\n","classes = np.unique(train_target)\n","for _ in range(0, 300):\n"," sc.partial_fit(train_scaled, train_target, 
classes=classes)\n","\n"," train_score.append(sc.score(train_scaled, train_target))\n"," test_score.append(sc.score(test_scaled, test_target))\n","\n","sc = SGDClassifier(loss='log_loss', max_iter=100, tol=None, random_state=42)\n","sc.fit(train_scaled, train_target)\n","\n","print(sc.score(train_scaled, train_target))\n","print(sc.score(test_scaled, test_target))"],"metadata":{"id":"GFBSr9OPe7Uq"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["# 훈련세트와 테스트셋의 성능 측정 그래프\n","import matplotlib.pyplot as plt\n","\n","plt.plot(train_score)\n","plt.plot(test_score)\n","plt.xlabel('epoch')\n","plt.ylabel('accuracy')\n","plt.show()"],"metadata":{"id":"fomxBb2IgKyN"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["# 힌지 손실(loss='hinge')은 소프트맥스가 아니라 서포트 벡터 머신(SVM)용 손실 함수\n","sc = SGDClassifier(loss='hinge', max_iter=100, tol=None, random_state=42)\n","sc.fit(train_scaled, train_target)\n","\n","print(sc.score(train_scaled, train_target))\n","print(sc.score(test_scaled, test_target))"],"metadata":{"id":"5ZFYqBype7ho"},"execution_count":null,"outputs":[]}]}
1 change: 1 addition & 0 deletions week4/[MLNovice]황지원_week4-1.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"provenance":[],"authorship_tag":"ABX9TyNpugaSWMPn0ue9hSmkzVix"},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"}},"cells":[{"cell_type":"code","execution_count":null,"metadata":{"id":"5QrsbuJIMTLD"},"outputs":[],"source":["# 데이터준비\n","import pandas as pd\n","\n","wine = pd.read_csv('https://bit.ly/wine_csv_data')\n","\n","# 누락된 데이터 값이 있는지 간단히 확인\n","wine.describe()\n","\n","data = wine[['alcohol', 'sugar', 'pH']].to_numpy()\n","target = wine['class'].to_numpy()"]},{"cell_type":"code","source":["# 데이터 스케일링\n","from sklearn.model_selection import train_test_split\n","\n","train_input, test_input, train_target, test_target = train_test_split(\n"," data, target, test_size=0.2, random_state=42)\n","\n","\n","from sklearn.preprocessing import StandardScaler\n","\n","ss = StandardScaler()\n","ss.fit(train_input)\n","\n","train_scaled = ss.transform(train_input)\n","test_scaled = ss.transform(test_input)"],"metadata":{"id":"z61hBwLES0oq"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["# 로지스틱 회귀\n","from sklearn.linear_model import LogisticRegression\n","\n","lr = LogisticRegression()\n","lr.fit(train_scaled, train_target)\n","\n","print(lr.score(train_scaled, train_target))\n","print(lr.score(test_scaled, test_target))\n","\n","print(lr.coef_, lr.intercept_)\n","# [[ 0.51268071 1.67335441 -0.68775646]] [1.81773456]"],"metadata":{"id":"gX6Q_8EES0rm"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["# 결정 트리\n","from sklearn.tree import DecisionTreeClassifier\n","\n","dt = DecisionTreeClassifier(random_state=42) # 사용할 특성을 랜덤하게 선택\n","dt.fit(train_scaled, train_target)\n","\n","# 트리 구조 보기\n","import matplotlib.pyplot as plt\n","from sklearn.tree import plot_tree\n","\n","plt.figure(figsize=(10,7))\n","plot_tree(dt)\n","plt.show()"],"metadata":{"id":"hq3oVCKYS0t4"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["# 트리 구조 
간단히 보기\n","plt.figure(figsize=(10,7))\n","plot_tree(dt, max_depth=1, filled=True, feature_names=['alcohol', 'sugar', 'pH'])\n","plt.show()"],"metadata":{"id":"KR1woCnmS0wM"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["# 가지치기\n","dt = DecisionTreeClassifier(max_depth=3, random_state=42)\n","# max_depth로 트리의 깊이 조절\n","dt.fit(train_scaled, train_target)\n","\n","plt.figure(figsize=(20,15))\n","plot_tree(dt, filled=True, feature_names=['alcohol', 'sugar', 'pH'])\n","plt.show()\n","\n","dt = DecisionTreeClassifier(max_depth=3, random_state=42)\n","dt.fit(train_input, train_target)\n","\n","plt.figure(figsize=(20,15))\n","plot_tree(dt, filled=True, feature_names=['alcohol', 'sugar', 'pH'])\n","plt.show()"],"metadata":{"id":"Hn-P2w22S8YQ"},"execution_count":null,"outputs":[]}]}
1 change: 1 addition & 0 deletions week4/[MLNovice]황지원_week4-2.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"provenance":[],"authorship_tag":"ABX9TyNy8oz0bYMZiwhj1kBoh0/Z"},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"}},"cells":[{"cell_type":"code","execution_count":null,"metadata":{"id":"nDI_gzaCTPjJ"},"outputs":[],"source":["# 검증 세트\n","\n","# 데이터\n","import pandas as pd\n","\n","wine = pd.read_csv('https://bit.ly/wine_csv_data')\n","data = wine[['alcohol', 'sugar', 'pH']].to_numpy()\n","target = wine['class'].to_numpy()\n","\n","# train, test 나누기\n","from sklearn.model_selection import train_test_split\n","\n","train_input, test_input, train_target, test_target = train_test_split(\n"," data, target, test_size=0.2, random_state=42)\n","\n","sub_input, val_input, sub_target, val_target = train_test_split(\n"," train_input, train_target, test_size=0.2, random_state=42)\n","\n","from sklearn.tree import DecisionTreeClassifier\n","\n","dt = DecisionTreeClassifier(random_state=42)\n","dt.fit(sub_input, sub_target)"]},{"cell_type":"code","source":["# 교차 검증\n","from sklearn.model_selection import cross_validate\n","\n","# 처음 나눈 세트로 교차검증\n","scores = cross_validate(dt, train_input, train_target)\n","\n","# 검증의 점수 평균 내보기\n","import numpy as np\n","print(np.mean(scores['test_score']))"],"metadata":{"id":"-n5UDAQ7bGsQ"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["# 분할기를 사용한 교차 검증\n","from sklearn.model_selection import StratifiedKFold\n","\n","scores = cross_validate(dt, train_input, train_target, cv=StratifiedKFold())\n","\n","\n","splitter = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)\n","scores = cross_validate(dt, train_input, train_target, cv=splitter)"],"metadata":{"id":"FEygEuZsbGv0"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["# 하이퍼 파라미터 튜닝\n","from sklearn.model_selection import GridSearchCV\n","\n","# 매개변수를 딕셔너리에 정의\n","params = {'min_impurity_decrease': [0.0001, 0.0002, 0.0003, 0.0004, 0.0005]}\n","\n","gs = 
GridSearchCV(DecisionTreeClassifier(random_state=42), params, n_jobs=-1)\n","gs.fit(train_input, train_target)\n","\n","# 최적의 값을 찾아서 best_estimator_에 넣어줌\n","dt = gs.best_estimator_\n","print(dt.score(train_input, train_target))\n","\n","print(gs.best_params_)\n","print(gs.cv_results_['mean_test_score'])"],"metadata":{"id":"5pD2hPx0hZt6"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["# 확률 분포 선택\n","from scipy.stats import uniform, randint # 균등 분포 샘플링\n","\n","rgen = randint(0, 10)\n","rgen.rvs(10)\n","\n","np.unique(rgen.rvs(1000), return_counts=True)\n","\n","ugen = uniform(0, 1) # 실수값을 샘플링\n","ugen.rvs(10)"],"metadata":{"id":"yX1FcDMzhZwq"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["# 랜덤 서치\n","params = {'min_impurity_decrease': uniform(0.0001, 0.001),\n"," 'max_depth': randint(20, 50),\n"," 'min_samples_split': randint(2, 25),\n"," 'min_samples_leaf': randint(1, 25),\n"," }\n","\n","from sklearn.model_selection import RandomizedSearchCV # 랜덤 서치 함수\n","\n","gs = RandomizedSearchCV(DecisionTreeClassifier(random_state=42), params,\n"," n_iter=100, n_jobs=-1, random_state=42)\n","# n_iter : 모델 개수\n","\n","gs.fit(train_input, train_target)\n","from sklearn.model_selection import RandomizedSearchCV\n","\n","gs = RandomizedSearchCV(DecisionTreeClassifier(random_state=42), params,\n"," n_iter=100, n_jobs=-1, random_state=42)\n","gs.fit(train_input, train_target)"],"metadata":{"id":"QJxx6uh2bGyu"},"execution_count":null,"outputs":[]}]}
1 change: 1 addition & 0 deletions week4/[MLNovice]황지원_week4-3.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"provenance":[],"authorship_tag":"ABX9TyM4+2lCH8tm12reo9hz8z2m"},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"}},"cells":[{"cell_type":"code","execution_count":null,"metadata":{"id":"g-eXJt1LbCY5"},"outputs":[],"source":["# 랜덤 포레스트\n","\n","# 데이터 불러오기\n","import numpy as np\n","import pandas as pd\n","from sklearn.model_selection import train_test_split\n","\n","wine = pd.read_csv('https://bit.ly/wine_csv_data')\n","\n","data = wine[['alcohol', 'sugar', 'pH']].to_numpy()\n","target = wine['class'].to_numpy()\n","\n","train_input, test_input, train_target, test_target = train_test_split(data, target, test_size=0.2, random_state=42)\n","\n","# 앙상블에서 랜덤포레스트 분류기\n","from sklearn.model_selection import cross_validate\n","from sklearn.ensemble import RandomForestClassifier\n","\n","rf = RandomForestClassifier(n_jobs=-1, random_state=42)\n","scores = cross_validate(rf, train_input, train_target, return_train_score=True, n_jobs=-1)\n","# 훈련세트, 검증세트 점수 확인\n","print(np.mean(scores['train_score']), np.mean(scores['test_score']))\n","\n","# OOB 샘플 : 남는 샘플데이터를 활용하여 검증\n","rf.fit(train_input, train_target)\n","rf = RandomForestClassifier(oob_score=True, n_jobs=-1, random_state=42)\n","rf.fit(train_input, train_target)"]},{"cell_type":"code","source":["# 엑스트라 트리\n","from sklearn.ensemble import ExtraTreesClassifier\n","\n","et = ExtraTreesClassifier(n_jobs=-1, random_state=42)\n","scores = cross_validate(et, train_input, train_target, return_train_score=True, n_jobs=-1)\n","\n","print(np.mean(scores['train_score']), np.mean(scores['test_score']))\n","\n","et.fit(train_input, train_target)\n","print(et.feature_importances_)"],"metadata":{"id":"gcS6Ifvbqmq5"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["# 그래디언트 부스팅\n","from sklearn.ensemble import GradientBoostingClassifier\n","\n","gb = GradientBoostingClassifier(random_state=42)\n","scores = cross_validate(gb, 
train_input, train_target, return_train_score=True, n_jobs=-1)\n","\n","# 트리 개수 늘리기 (500개로 늘림)\n","gb = GradientBoostingClassifier(n_estimators=500, learning_rate=0.2, random_state=42)\n","scores = cross_validate(gb, train_input, train_target, return_train_score=True, n_jobs=-1)\n","\n","gb.fit(train_input, train_target)"],"metadata":{"id":"XuvMPwC9qmtZ"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["# 히스토그램 기반 부스팅\n","from sklearn.ensemble import HistGradientBoostingClassifier\n","\n","hgb = HistGradientBoostingClassifier(random_state=42)\n","scores = cross_validate(hgb, train_input, train_target, return_train_score=True, n_jobs=-1)\n","\n","from sklearn.inspection import permutation_importance\n","\n","hgb.fit(train_input, train_target)\n","result = permutation_importance(hgb, train_input, train_target, n_repeats=10,random_state=42, n_jobs=-1)\n","\n","result = permutation_importance(hgb, test_input, test_target, n_repeats=10, random_state=42, n_jobs=-1)\n","hgb.score(test_input, test_target)"],"metadata":{"id":"qmHHje1UqqOL"},"execution_count":null,"outputs":[]}]}
1 change: 1 addition & 0 deletions week4/ch5_TreeAlgorithm
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@

1 change: 1 addition & 0 deletions week5/Ch6 Clustering
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Ch6 Clustering
Loading