Python sklearn机器学习基础学习笔记1
# 引入sklearniris数据from sklearn.datasets import load_iris# 划分数据,训练/验证数据from sklearn.model_selection import train_test_split# 转换器,转换为sparse矩阵或one-hot编码矩阵from sklearn.feature_extraction import DictVectorizer
# 引入sklearniris数据
from sklearn.datasets import load_iris
# 划分数据,训练/验证数据
from sklearn.model_selection import train_test_split
# 转换器,转换为sparse矩阵或one-hot编码矩阵
from sklearn.feature_extraction import DictVectorizer
# 文本tf-idf值计算,转换为sparse矩阵或one-hot编码矩阵
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# 中文分词
import jieba
# 读取文件
import pandas as pd
# 归一化处理数据
from sklearn.preprocessing import MinMaxScaler
# 标准化处理数据
from sklearn.preprocessing import StandardScaler
# 降维
from sklearn.feature_selection import VarianceThreshold
# 相关系数计算
from scipy.stats import pearsonr
# 可视化展示
import matplotlib.pyplot as plt
# PCA降维
from sklearn.decomposition import PCA
def pca_demo():
    """Run a PCA dimensionality-reduction demo on a tiny toy matrix.

    Prints the raw samples, then the array produced by keeping enough
    principal components to explain at least 95% of the variance.

    :return: None
    """
    # 1. Toy data set: three samples, four features each.
    samples = [
        [2, 8, 4, 5],
        [6, 3, 0, 8],
        [5, 4, 9, 1],
    ]
    print(samples)
    # 2. A float n_components tells PCA to keep however many components
    #    are needed to retain that fraction of the total variance.
    reducer = PCA(n_components=0.95)
    # 3. Fit the model and project the samples in one step.
    projected = reducer.fit_transform(samples)
    print("data_new:\n", projected)
    return None
# def variance_demo():
# """
# 过滤低方差特征
# :return:
# """
# # 1、读取数据
# data = pd.read_csv("factor_returns.csv")
# print(data)
# # 2、实例化一个转换器
# transfer = VarianceThreshold(threshold=0)
# # 3、调用VarianceThreshold()
# data_new = transfer.fit_transform(data.iloc[:, 1:10])
# print("data_new:\n", data_new, data_new.shape)
# # 4、计算两个变量之间的相关系数
# r1 = pearsonr(data["pe_ratio"], data["pb_ratio"])
# print("相关系数:\n", r1)
# r2 = pearsonr(data['revenue'], data['total_expense'])
# print("相关系数:\n", r2)
# # 画图展示
# plt.figure(figsize=(20, 8), dpi=100)
# plt.scatter(data['revenue'], data['total_expense'])
# plt.show()
# return None
# Script entry point: only the PCA demo is currently enabled; the other
# demos in this file are commented out.
if __name__ == '__main__':
    pca_demo()
# def standar_demo():
# """
# 数据标准化处理 > 归一化处理
# :return:
# """
# # 1、读取数据
# data = pd.read_csv("dating.txt")
# data = data.iloc[:, :3]
# print(data)
# # 2、实例化一个转换器
# transfer = StandardScaler()
# # 3、调用MinMaxScaler()
# data_new = transfer.fit_transform(data)
# print("data_new:\n", data_new)
# return None
# def minmax_demo():
# """
# 数据归一化处理
# :return:
# """
# # 1、读取数据
# data = pd.read_csv("dating.txt")
# data = data.iloc[:, :3]
# print(data)
# # 2、实例化一个转换器
# transfer = MinMaxScaler(feature_range=[0, 1])
# # 3、调用MinMaxScaler()
# data_new = transfer.fit_transform(data)
# print("data_new:\n", data_new)
# return None
def cut_word(text):
    """Segment a Chinese string into words using jieba.

    :param text: raw Chinese text to tokenise
    :return: the tokens joined into a single space-separated string
    """
    # jieba.cut yields tokens lazily; str.join consumes the iterator directly.
    return " ".join(jieba.cut(text))
# def datasets_demo():
# iris = load_iris()
# x_train, x_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.2, random_state=22)
# print("特征值:\n", x_train, x_train.shape)
# return None
# def dict_demo():
# """
# 字典特征提取
# :return:
# """
# data = [{'city': '北京', 'tempeerature': 100}, {'city': '上海', 'tempeerature': 60}, {'city': '深圳', 'tempeerature': 30}]
# # 1、实例化一个转换器
# transfer = DictVectorizer(sparse=False)
# # 2、调用fit_transform()
# data_new = transfer.fit_transform(data)
# print("data_new:\n", data_new)
# return None
# def count_demo():
# """
# 文本特征提取
# :return:
# """
# data = ["life is short,i like like python", "life is short,i dislike python"]
# # 1、实例化一个转换器
# transfer = CountVectorizer()
# # 2、调用fit_transform()
# data_new = transfer.fit_transform(data)
# print("data_new:\n", data_new.toarray())
# print("特征名字:\n", transfer.get_feature_names())
# return None
# def count_chinese_demo():
# """
# 中文文本特征提取
# :return:
# """
# data = ["今天bai很残酷,明天du更残酷,后天会zhi很美好,但绝大多数人都死dao在明天zhuan晚上。”原话是这个的shu",
# "你能去欣赏每个人、每个员工的时候,你才是最好的。年轻人首先要学会用欣赏的眼光去看待别人。生活是残酷的,今天很残酷,明天更残酷。今天必须努力,才能活到明",
# "马云的话,可能只漏了一点,完整版或许应该是这样的"]
# # 1、将中文文本进行分词
# data1 = []
# for sent in data:
# data1.append(cut_word(sent))
# # 1、实例化一个转换器
# transfer = CountVectorizer()
# # 2、调用fit_transform()
# data_new = transfer.fit_transform(data1)
# print("data_new:\n", data_new.toarray())
# print("特征名字:\n", transfer.get_feature_names())
# return None
def tfidf_demo():
    """
    Extract text features from Chinese sentences using TF-IDF weighting.

    Segments each sentence with jieba (via cut_word), fits a
    TfidfVectorizer on the segmented corpus, then prints the dense
    TF-IDF matrix and the learned vocabulary.

    :return: None
    """
    data = ["今天bai很残酷,明天du更残酷,后天会zhi很美好,但绝大多数人都死dao在明天zhuan晚上。”原话是这个的shu",
            "你能去欣赏每个人、每个员工的时候,你才是最好的。年轻人首先要学会用欣赏的眼光去看待别人。生活是残酷的,今天很残酷,明天更残酷。今天必须努力,才能活到明",
            "马云的话,可能只漏了一点,完整版或许应该是这样的"]
    # 1、将中文文本进行分词 (segment each sentence into space-separated tokens)
    data1 = [cut_word(sent) for sent in data]
    # 2、实例化一个转换器 (instantiate the TF-IDF transformer)
    transfer = TfidfVectorizer()
    # 3、调用fit_transform() — returns a sparse TF-IDF matrix
    data_new = transfer.fit_transform(data1)
    print("data_new:\n", data_new.toarray())
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed
    # in 1.2; get_feature_names_out() is the supported replacement.
    print("特征名字:\n", transfer.get_feature_names_out())
    return None
dating.txt:
milage,Liters,Consumtime,target
40920,8.326976,0.953952,3
14488,7.153469,1.673904,2
26052,1.441871,0.805124,1
75136,13.147394,0.428964,1
38344,1.669788,0.134296,1
factor_returns.csv:
index,pe_ratio,pb_ratio,market_cap,return_on_asset_net_profit,du_return_on_equity,ev,earnings_per_share,revenue,total_expense,date,return
0,000001.XSHE,5.9572,1.1818,85252550922.0,0.8008,14.9403,1211444855670.0,2.01,20701401000.0,10882540000.0,2012-01-31,0.027657228229937388
1,000002.XSHE,7.0289,1.588,84113358168.0,1.6463,7.8656,300252061695.0,0.326,29308369223.2,23783476901.2,2012-01-31,0.08235182370820669
2,000008.XSHE,-262.7461,7.0003,517045520.0,-0.5678,-0.5943,770517752.56,-0.006,11679829.03,12030080.04,2012-01-31,0.09978900335112327
3,000060.XSHE,16.476,3.7146,19680455995.0,5.6036,14.617,28009159184.6,0.35,9189386877.65,7935542726.05,2012-01-31,0.12159482758620697
4,000069.XSHE,12.5878,2.5616,41727214853.0,2.8729,10.9097,81247380359.0,0.271,8951453490.28,7091397989.13,2012-01-31,-0.0026808154146886697
更多推荐


所有评论(0)