# 引入sklearniris数据
from sklearn.datasets import load_iris
# 划分数据,训练/验证数据
from sklearn.model_selection import train_test_split
# 转换器,转换为sparse矩阵或one-hot编码矩阵
from sklearn.feature_extraction import DictVectorizer
# 文本tf-idf值计算,转换为sparse矩阵或one-hot编码矩阵
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# 中文分词
import jieba
# 读取文件
import pandas as pd
# 归一化处理数据
from sklearn.preprocessing import MinMaxScaler
# 标准化处理数据
from sklearn.preprocessing import StandardScaler
# 降维
from sklearn.feature_selection import VarianceThreshold
# 相关系数计算
from scipy.stats import pearsonr
# 可视化展示
import matplotlib.pyplot as plt
# PCA降维
from sklearn.decomposition import PCA


def pca_demo(data=None, n_components=0.95):
    """
    PCA dimensionality-reduction demo.

    :param data: 2D array-like (samples x features). Defaults to a small
                 built-in example matrix, so calling ``pca_demo()`` with no
                 arguments behaves like before.
    :param n_components: forwarded to :class:`sklearn.decomposition.PCA`:
                         an int keeps that many components, a float in
                         (0, 1) keeps enough components to explain that
                         fraction of the variance.
    :return: the reduced array produced by ``fit_transform``
    """
    # 1. Fall back to the built-in example data when none is supplied.
    if data is None:
        data = [[2, 8, 4, 5],
                [6, 3, 0, 8],
                [5, 4, 9, 1]]
    print(data)
    # 2. Instantiate the transformer.
    transfer = PCA(n_components=n_components)
    # 3. Fit and transform in one step.
    data_new = transfer.fit_transform(data)
    print("data_new:\n", data_new)
    # Return the result instead of None so callers can reuse it.
    return data_new


# def variance_demo():
#     """
#         过滤低方差特征
#         :return:
#         """
#     # 1、读取数据
#     data = pd.read_csv("factor_returns.csv")
#     print(data)
#     # 2、实例化一个转换器
#     transfer = VarianceThreshold(threshold=0)
#     # 3、调用VarianceThreshold()
#     data_new = transfer.fit_transform(data.iloc[:, 1:10])
#     print("data_new:\n", data_new, data_new.shape)
#     # 4、计算两个变量之间的相关系数
#     r1 = pearsonr(data["pe_ratio"], data["pb_ratio"])
#     print("相关系数:\n", r1)
#     r2 = pearsonr(data['revenue'], data['total_expense'])
#     print("相关系数:\n", r2)
#     # 画图展示
#     plt.figure(figsize=(20, 8), dpi=100)
#     plt.scatter(data['revenue'], data['total_expense'])
#     plt.show()
#     return None


# Script entry point: run only the active PCA demo when executed directly.
if __name__ == '__main__':
    pca_demo()


# def standar_demo():
#     """
#     数据标准化处理 > 归一化处理
#     :return:
#     """
#     # 1、读取数据
#     data = pd.read_csv("dating.txt")
#     data = data.iloc[:, :3]
#     print(data)
#     # 2、实例化一个转换器
#     transfer = StandardScaler()
#     # 3、调用MinMaxScaler()
#     data_new = transfer.fit_transform(data)
#     print("data_new:\n", data_new)
#     return None


# def minmax_demo():
#     """
#     数据归一化处理
#     :return:
#     """
#     # 1、读取数据
#     data = pd.read_csv("dating.txt")
#     data = data.iloc[:, :3]
#     print(data)
#     # 2、实例化一个转换器
#     transfer = MinMaxScaler(feature_range=[0, 1])
#     # 3、调用MinMaxScaler()
#     data_new = transfer.fit_transform(data)
#     print("data_new:\n", data_new)
#     return None


def cut_word(text):
    """
    Segment a Chinese string into words.

    :param text: raw Chinese text
    :return: the jieba word segments joined by single spaces
    """
    # str.join accepts the generator from jieba.cut directly.
    return " ".join(jieba.cut(text))


# def datasets_demo():
#     iris = load_iris()
#     x_train, x_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.2, random_state=22)
#     print("特征值:\n", x_train, x_train.shape)
#     return None


# def dict_demo():
#     """
#     字典特征提取
#     :return:
#     """
#     data = [{'city': '北京', 'temperature': 100}, {'city': '上海', 'temperature': 60}, {'city': '深圳', 'temperature': 30}]
#     # 1、实例化一个转换器
#     transfer = DictVectorizer(sparse=False)
#     # 2、调用fit_transform()
#     data_new = transfer.fit_transform(data)
#     print("data_new:\n", data_new)
#     return None

# def count_demo():
#     """
#     文本特征提取
#     :return:
#     """
#     data = ["life is short,i like like python", "life is short,i dislike python"]
#     # 1、实例化一个转换器
#     transfer = CountVectorizer()
#     # 2、调用fit_transform()
#     data_new = transfer.fit_transform(data)
#     print("data_new:\n", data_new.toarray())
#     print("特征名字:\n", transfer.get_feature_names())
#     return None

# def count_chinese_demo():
#     """
#     中文文本特征提取
#     :return:
#     """
#     data = ["今天bai很残酷,明天du更残酷,后天会zhi很美好,但绝大多数人都死dao在明天zhuan晚上。”原话是这个的shu",
#             "你能去欣赏每个人、每个员工的时候,你才是最好的。年轻人首先要学会用欣赏的眼光去看待别人。生活是残酷的,今天很残酷,明天更残酷。今天必须努力,才能活到明",
#             "马云的话,可能只漏了一点,完整版或许应该是这样的"]
#     # 1、将中文文本进行分词
#     data1 = []
#     for sent in data:
#         data1.append(cut_word(sent))
#     # 1、实例化一个转换器
#     transfer = CountVectorizer()
#     # 2、调用fit_transform()
#     data_new = transfer.fit_transform(data1)
#     print("data_new:\n", data_new.toarray())
#     print("特征名字:\n", transfer.get_feature_names())
#     return None


def tfidf_demo():
    """
    Text feature extraction with TF-IDF.

    Segments three Chinese sentences with jieba, vectorizes them with
    :class:`TfidfVectorizer`, and prints the dense TF-IDF matrix and the
    learned feature (vocabulary) names.

    :return: None
    """
    data = ["今天bai很残酷,明天du更残酷,后天会zhi很美好,但绝大多数人都死dao在明天zhuan晚上。”原话是这个的shu",
            "你能去欣赏每个人、每个员工的时候,你才是最好的。年轻人首先要学会用欣赏的眼光去看待别人。生活是残酷的,今天很残酷,明天更残酷。今天必须努力,才能活到明",
            "马云的话,可能只漏了一点,完整版或许应该是这样的"]
    # 1. Segment each Chinese sentence into space-separated words.
    segmented = [cut_word(sent) for sent in data]
    # 2. Instantiate the transformer.
    transfer = TfidfVectorizer()
    # 3. Fit and transform the segmented corpus.
    data_new = transfer.fit_transform(segmented)
    print("data_new:\n", data_new.toarray())
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed
    # in 1.2; get_feature_names_out() is the current equivalent.
    print("特征名字:\n", transfer.get_feature_names_out())
    return None

# ---------------------------------------------------------------------------
# Sample data appendix (reference only — commented out so the file parses).
#
# dating.txt:
#
# milage,Liters,Consumtime,target
# 40920,8.326976,0.953952,3
# 14488,7.153469,1.673904,2
# 26052,1.441871,0.805124,1
# 75136,13.147394,0.428964,1
# 38344,1.669788,0.134296,1
#
# factor_returns.csv:
#
# index,pe_ratio,pb_ratio,market_cap,return_on_asset_net_profit,du_return_on_equity,ev,earnings_per_share,revenue,total_expense,date,return
# 0,000001.XSHE,5.9572,1.1818,85252550922.0,0.8008,14.9403,1211444855670.0,2.01,20701401000.0,10882540000.0,2012-01-31,0.027657228229937388
# 1,000002.XSHE,7.0289,1.588,84113358168.0,1.6463,7.8656,300252061695.0,0.326,29308369223.2,23783476901.2,2012-01-31,0.08235182370820669
# 2,000008.XSHE,-262.7461,7.0003,517045520.0,-0.5678,-0.5943,770517752.56,-0.006,11679829.03,12030080.04,2012-01-31,0.09978900335112327
# 3,000060.XSHE,16.476,3.7146,19680455995.0,5.6036,14.617,28009159184.6,0.35,9189386877.65,7935542726.05,2012-01-31,0.12159482758620697
# 4,000069.XSHE,12.5878,2.5616,41727214853.0,2.8729,10.9097,81247380359.0,0.271,8951453490.28,7091397989.13,2012-01-31,-0.0026808154146886697
# ---------------------------------------------------------------------------