
Machine Learning [Regression] practice

by jhleeatl 2024. 6. 13.


Question 1.

# Load the iris dataset into a DataFrame
from sklearn.datasets import load_iris
import pandas as pd

iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df['target'] = iris.target
df['target_name'] = iris.target_names[iris.target]

# Standardize the features
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

X, y = iris.data, iris.target
# Standardize X (see the note after the train/test split below)
X_sc = scaler.fit_transform(X)
df.head()

[Output: first five rows of df, shown as a table in the original post]

from sklearn.model_selection import train_test_split

# Split the scaled data into train and test sets (80:20)
X_train, X_test, y_train, y_test = train_test_split(X_sc, y, test_size=0.2, random_state=42)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

#result
(120, 4) (30, 4) (120,) (30,)
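Two caveats on this split. First, the scaler above was fit on all 150 rows before splitting, so test-set statistics leak into training; fitting it on the train split only avoids that. Second, for a classification target, stratify=y keeps class proportions equal across the splits. A minimal leakage-free sketch (the _raw / _sc variable names are illustrative):

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Split the unscaled features first, keeping class proportions with stratify
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

scaler = StandardScaler()
X_train_sc = scaler.fit_transform(X_train_raw)  # fit scaling statistics on train only
X_test_sc = scaler.transform(X_test_raw)        # apply the same statistics to test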

 

 

import seaborn as sns

# Pairwise scatter plots of the four features, colored by species
sns.pairplot(df, vars=['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)'], hue='target_name')

[Figure: pairplot of the four iris features, colored by target_name]

len(df)
#result
150

 

# Box plots of the four (unscaled) features
sns.boxplot(data=df[['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']])

[Figure: box plots of the four features]

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   sepal length (cm)  150 non-null    float64
 1   sepal width (cm)   150 non-null    float64
 2   petal length (cm)  150 non-null    float64
 3   petal width (cm)   150 non-null    float64
 4   target             150 non-null    int64  
 5   target_name        150 non-null    object 
dtypes: float64(4), int64(1), object(1)
memory usage: 7.2+ KB

 

from sklearn.linear_model import LogisticRegression
model_lor = LogisticRegression()
model_lor.fit(X_train, y_train)


#result
LogisticRegression()

 

def get_att(x, feature_names):
    print('Classes:', x.classes_)
    print('Number of features:', x.coef_.shape[1])
    print('Feature names:', feature_names)
    print('Weights:', x.coef_)
    print('Bias:', x.intercept_)

get_att(model_lor, df.columns[:-2])  # exclude the last two columns of df (target, target_name); the rest are the feature names

#result
Classes: [0 1 2]
Number of features: 4
Feature names: Index(['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
       'petal width (cm)'],
      dtype='object')
Weights: [[-1.0205085   1.13224015 -1.81712983 -1.68691733]
 [ 0.53384517 -0.2826248  -0.34271392 -0.73094819]
 [ 0.48666333 -0.84961535  2.15984376  2.41786552]]
Bias: [-0.24838294  1.97381496 -1.72543202]
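The raw weight matrix is easier to read as a table with one row per class. A small convenience sketch, assuming model_lor, df, and iris from above:

import pandas as pd

coef_table = pd.DataFrame(model_lor.coef_,
                          columns=df.columns[:-2],
                          index=iris.target_names)
print(coef_table)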

 

from sklearn.metrics import accuracy_score

# Generate predictions for the test data
y_pred = model_lor.predict(X_test)

# Compare the predictions against the true targets
accuracy = accuracy_score(y_test, y_pred)

print("Accuracy:", accuracy)

#result
Accuracy: 1.0
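A perfect 1.0 on a 30-row test set is plausible for iris, but a per-class breakdown makes the result easier to trust (a quick sketch reusing y_test and y_pred):

from sklearn.metrics import classification_report, confusion_matrix

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names=iris.target_names))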

 

import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

# 3D scatter of the first three train features, colored by the true labels
fig = plt.figure(figsize=(12, 6))
ax1 = fig.add_subplot(121, projection='3d')
ax1.scatter(X_train[:, 0], X_train[:, 1], X_train[:, 2], c=y_train, cmap='viridis', label='Train Data')
ax1.set_xlabel('Feature 1')
ax1.set_ylabel('Feature 2')
ax1.set_zlabel('Feature 3')
ax1.set_title('Train Data')

# 3D scatter of the first three test features, colored by the predicted labels
ax2 = fig.add_subplot(122, projection='3d')
ax2.scatter(X_test[:, 0], X_test[:, 1], X_test[:, 2], c=y_pred, cmap='viridis', marker='^', label='Test Predictions')
ax2.set_xlabel('Feature 1')
ax2.set_ylabel('Feature 2')
ax2.set_zlabel('Feature 3')
ax2.set_title('Test Predictions')

plt.show()

[Figure: two 3D scatter plots, train data with true labels and test data with predicted labels]

Question 3.

 

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor 
from sklearn.metrics import mean_squared_error

# Train the model (reusing the iris train split from Question 1)
model = DecisionTreeRegressor(max_depth=5, random_state=42)
model.fit(X_train, y_train)

#result
DecisionTreeRegressor(max_depth=5, random_state=42)

 

y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("MSE:", mse)

#result
MSE: 0.0
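An MSE of exactly 0.0 is a red flag rather than a success: the tree is fitting the integer iris class labels, and a depth-5 tree can reproduce them exactly on this split. Cross-validation gives a more honest estimate (a quick sketch, assuming X_sc and y from Question 1):

from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor

scores = cross_val_score(DecisionTreeRegressor(max_depth=5, random_state=42),
                         X_sc, y, cv=5, scoring='neg_mean_squared_error')
print("CV MSE:", -scores.mean())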

 

import matplotlib.pyplot as plt

 

Visualization

plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred, alpha=0.5)
plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], color='red')  # diagonal: perfect-prediction line
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
plt.title('Actual vs Predicted Values')
plt.show()

[Figure: actual vs. predicted values scatter plot]

Question 5.

 

from sklearn.datasets import load_iris
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# KMeans clustering with 3 clusters
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
clusters = kmeans.fit_predict(X_train)

# Add the cluster assignments to a train DataFrame
df_train = pd.DataFrame(X_train, columns=iris.feature_names)
df_train['target'] = y_train
df_train['cluster'] = clusters


print(df_train.head())

   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0          -1.506521          1.249201          -1.567576         -1.315444   
1          -0.173674          3.090775          -1.283389         -1.052180   
2           1.038005          0.098217           0.364896          0.264142   
3          -1.264185          0.788808          -1.226552         -1.315444   
4          -1.748856          0.328414          -1.397064         -1.315444   

   target  cluster  
0       0        0  
1       0        0  
2       1        1  
3       0        0  
4       0        0

 

import matplotlib.pyplot as plt
import seaborn as sns

# Visualize the clustering results (train data)
plt.figure(figsize=(12, 6))

# subplot 1: cluster labels
plt.subplot(1, 2, 1)
sns.scatterplot(data=df_train, x='sepal length (cm)', y='sepal width (cm)',
                hue='cluster', palette='viridis', style='cluster', s=100)
plt.title('KMeans Clustering Results')
plt.xlabel('Sepal Length (cm)')
plt.ylabel('Sepal Width (cm)')
plt.legend(title='Cluster')

# subplot 2: actual target labels
plt.subplot(1, 2, 2)
sns.scatterplot(data=df_train, x='sepal length (cm)', y='sepal width (cm)',
                hue='target', palette='viridis', style='target', s=100)
plt.title('Actual Target Labels')
plt.xlabel('Sepal Length (cm)')
plt.ylabel('Sepal Width (cm)')
plt.legend(title='Target')

plt.tight_layout()
plt.show()

[Figure: side-by-side scatter plots, KMeans clusters vs. actual target labels]

import numpy as np
from sklearn.metrics import accuracy_score
from scipy.stats import mode

# Find the most frequent true label within each cluster
labels = np.zeros_like(clusters)
for i in range(3):
    mask = (clusters == i)
    labels[mask] = mode(y_train[mask])[0]

# Compute the accuracy of the mapped labels
accuracy = accuracy_score(y_train, labels)
print(f'Clustering Accuracy: {accuracy:.2f}')

#result
Clustering Accuracy: 0.80
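The majority-label mapping above is one way to score a clustering against known labels; adjusted_rand_score compares the two labelings directly, with no mapping step and no sensitivity to how cluster IDs are numbered (a short sketch on the same arrays):

from sklearn.metrics import adjusted_rand_score

ari = adjusted_rand_score(y_train, clusters)
print(f'Adjusted Rand Index: {ari:.2f}')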

 

 

 

Question 2.

X = np.load("/content/sample_data/cal_X.npy")
y = np.load("/content/sample_data/cal_y.npy")

X
array([[   8.3252    ,   41.        ,    6.98412698, ...,    2.55555556,
          37.88      , -122.23      ],
       [   8.3014    ,   21.        ,    6.23813708, ...,    2.10984183,
          37.86      , -122.22      ],
       [   7.2574    ,   52.        ,    8.28813559, ...,    2.80225989,
          37.85      , -122.24      ],
       ...,
       [   1.7       ,   17.        ,    5.20554273, ...,    2.3256351 ,
          39.43      , -121.22      ],
       [   1.8672    ,   18.        ,    5.32951289, ...,    2.12320917,
          39.43      , -121.32      ],
       [   2.3886    ,   16.        ,    5.25471698, ...,    2.61698113,
          39.37      , -121.24      ]])
          
          
y
array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894])

 

# Scatter of the first feature (median income in the California housing data) against the target
sns.scatterplot(x=X[:, 0], y=y)

[Figure: scatter plot of median income vs. median house value]

# Split into train and test sets (80:20)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

from sklearn.linear_model import LinearRegression
house_model = LinearRegression()
house_model.fit(X_train, y_train)


#result
LinearRegression()

 

 

from sklearn.metrics import mean_squared_error
y_pred = house_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"MSE: {mse}")

#result
MSE: 0.5558915986952422
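MSE by itself is hard to read on the target's scale. RMSE and R² are easier to interpret (a short follow-up sketch on the same predictions):

import numpy as np
from sklearn.metrics import r2_score

rmse = np.sqrt(mse)            # error in the target's own units
r2 = r2_score(y_test, y_pred)  # fraction of variance explained
print(f"RMSE: {rmse:.3f}, R^2: {r2:.3f}")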

 

 

 

Question 4.

url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
titanic = pd.read_csv(url)

titanic.head()

[Output: first five rows of the Titanic dataset]

# Encode Sex as 0 (male) / 1 (female)
def Sex_en(x):
    if x == 'male':
        return 0
    else:
        return 1

titanic['Sex_en'] = titanic['Sex'].apply(Sex_en)
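The same encoding can be written more compactly with pandas map (equivalent here, assuming the Sex column only contains 'male' and 'female'):

titanic['Sex_en'] = titanic['Sex'].map({'male': 0, 'female': 1})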


import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


# Fill missing ages before selecting features, so X contains no NaN values
titanic['Age'] = titanic['Age'].fillna(titanic['Age'].mean())

# Select the features to use
features = ['Pclass', 'Sex_en', 'Age', 'SibSp', 'Parch', 'Fare']

# Set up the input matrix X and the target y
X = titanic[features]
y = titanic['Survived']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          891 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
 12  Sex_en       891 non-null    int64  
dtypes: float64(2), int64(6), object(5)
memory usage: 90.6+ KB

 

rf_classifier = RandomForestClassifier()
rf_classifier.fit(X_train, y_train)


#result
RandomForestClassifier()

from sklearn.metrics import f1_score

y_pred = rf_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)  # store under a new name so the f1_score function is not shadowed

print("Accuracy:", accuracy)
print("f1_score:", f1)

#result
Accuracy: 0.8044692737430168
f1_score: 0.7552447552447553
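To see which inputs the forest leans on, the fitted model exposes feature_importances_ (a short follow-up sketch; exact values will vary because RandomForestClassifier was left unseeded):

import pandas as pd

importances = pd.Series(rf_classifier.feature_importances_, index=features)
print(importances.sort_values(ascending=False))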
