2/3 문제 | Notion

문제 : 랜덤 포레스트의 특성 중요도를 시각화하고 feature_importances_와 permutation_importance()의 차이를 알아봅시다.

정답 : https://scikit-learn.org/stable/auto_examples/ensemble/plot_forest_importances.html#sphx-glr-auto-examples-ensemble-plot-forest-importances-py

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# Compare two ways of ranking the features of a random forest:
#   1. the impurity-based importances stored on the fitted model
#      (``feature_importances_``, "mean decrease in impurity" / MDI), and
#   2. permutation importances measured on a held-out test set
#      (``permutation_importance``, "mean accuracy decrease").

# Generate a synthetic binary-classification dataset.
X, y = make_classification(
    n_samples=1000,  # 1000 samples
    n_features=10,  # 10 features in total
    n_informative=3,  # 3 informative (important) features
    n_redundant=0,  # no features built as combinations of existing ones
    n_repeated=0,  # no duplicated features
    n_classes=2,  # binary classification
    random_state=0,
    shuffle=False,  # keep the generated order (no shuffling)
)

# Split into training and test sets; stratify=y keeps the class ratio the
# same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=42
)

from sklearn.ensemble import RandomForestClassifier

# Build the random forest classifier, fit it on the training set, and keep
# the impurity-based importance of every feature.
forest = RandomForestClassifier(random_state=0)
forest.fit(X_train, y_train)
importances = forest.feature_importances_

import numpy as np

# Standard deviation of each feature's importance across the individual
# trees of the forest (axis=0 -> one value per feature); drawn as the error
# bars (black lines) of the bar chart below.
std = np.std([tree.feature_importances_ for tree in forest.estimators_], axis=0)

import pandas as pd
import matplotlib.pyplot as plt

# Feature names: "feature 0", "feature 1", ..., one per column of X.
feature_names = [f"feature {i}" for i in range(X.shape[1])]

# Wrap the importances in a pandas Series so the bars are labelled by name.
forest_importances = pd.Series(importances, index=feature_names)

# Bar chart of the MDI importances, with the per-tree std as error bars.
fig, ax = plt.subplots()
forest_importances.plot.bar(yerr=std, ax=ax)
ax.set_title("Feature importances using MDI")
ax.set_ylabel("Mean decrease in impurity")
fig.tight_layout()

from sklearn.inspection import permutation_importance

# Permutation importance evaluated on the test set; every feature is
# permuted n_repeats=10 times. random_state makes the permutations
# reproducible.
result = permutation_importance(
    forest, X_test, y_test, n_repeats=10, random_state=42
)

# Wrap the mean permutation importances in a pandas Series.
# NOTE: this rebinds ``forest_importances``; the MDI Series above has
# already been plotted, so it is no longer needed.
forest_importances = pd.Series(result.importances_mean, index=feature_names)

# Bar chart of the permutation importances. The result already carries the
# mean and the standard deviation over the repeats, so the error bars come
# straight from result.importances_std -- nothing extra to compute.
fig, ax = plt.subplots()
forest_importances.plot.bar(yerr=result.importances_std, ax=ax)
ax.set_title("Feature importances using permutation on full model")
ax.set_ylabel("Mean accuracy decrease")
fig.tight_layout()
plt.show()