# Python探索相关性详细指南

2021年4月14日14:45:33 发表评论 819 次浏览

相关系数 r 的取值范围是 -1 到 1。r 的绝对值越接近 1，数据点越接近一条直线，线性关联越强（r 为正表示正相关，r 为负表示负相关）；r 越接近 0，线性关联就越弱。

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import norm

# Load the house-price table from the CSV file in the working directory.
data = pd.read_csv(filepath_or_buffer="House Price.csv")
# Notebook-style bare expression: displays the (rows, columns) tuple.
data.shape

(1460, 81)

"销售价格"说明

# Summary statistics (count, mean, std, min, quartiles, max) of the target column.
data.loc[:, "SalePrice"].describe()

count      1460.000000
mean     180921.195890
std       79442.502883
min       34900.000000
25%      129975.000000
50%      163000.000000
75%      214000.000000
max      755000.000000
Name: SalePrice, dtype: float64

# Open a 9x5 inch figure, then draw the SalePrice histogram on it.
plt.figure(figsize=(9, 5))
data["SalePrice"].plot.hist()

# Pairwise Pearson correlation matrix of the numeric columns.
# The frame is restricted to numeric dtypes first: on pandas >= 2.0,
# DataFrame.corr() no longer drops non-numeric columns silently and raises
# if any are present (this dataset presumably has string-valued columns --
# confirm against the CSV). select_dtypes works on old and new pandas alike.
corrmat = data.select_dtypes(include="number").corr()

# Draw the full correlation matrix as a heatmap on a 9x8 inch figure.
f, ax = plt.subplots(figsize=(9, 8))
sns.heatmap(corrmat, ax=ax, cmap="YlGnBu", linewidths=0.1)

# Pairwise Pearson correlation matrix of the numeric columns.
# Non-numeric columns are excluded explicitly because DataFrame.corr() on
# pandas >= 2.0 raises instead of silently skipping them.
corrmat = data.select_dtypes(include="number").corr()

# Hierarchically-clustered heatmap: rows/columns are reordered so that
# variables with similar correlation profiles end up next to each other.
cg = sns.clustermap(corrmat, cmap="YlGnBu", linewidths=0.1)
# Keep the y-axis tick labels horizontal so the variable names stay readable.
plt.setp(cg.ax_heatmap.yaxis.get_majorticklabels(), rotation=0)

# Notebook-style bare expression: displays the ClusterGrid object.
cg

# SalePrice correlation matrix
# k : number of variables for heatmap
k = 15

# The k variables most positively correlated with SalePrice
# (SalePrice itself comes first, since its self-correlation is 1).
cols = corrmat["SalePrice"].nlargest(k).index

# Correlation matrix restricted to those k variables.
# DataFrame.corr() is used instead of np.corrcoef: corrcoef propagates NaN,
# so a single missing value in a selected column would turn that variable's
# entire row and column into NaN, while pandas excludes missing values
# pairwise -- the same way `corrmat`, which selected these columns, was built.
cm = data[cols].corr().values

f, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    cm,
    ax=ax,
    cmap="YlGnBu",
    linewidths=0.1,
    yticklabels=cols.values,
    xticklabels=cols.values,
)