# Competition page: https://www.kaggle.com/c/titanic

"""
__author__:shuangrui Guo
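__description__: Kaggle Titanic survival prediction. Preprocesses train.csv/test.csv, compares AdaBoost, decision-tree and random-forest classifiers, and writes submission.csv.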
"""
import os
import sys
import warnings
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier,RandomForestClassifier
from sklearn.metrics import accuracy_score
# for dirname,_,filenames in os.walk('./data'):
#     print(dirname,filenames)
# Value assigned to tickets whose last token is not numeric (e.g. "LINE").
NON_NUMERIC_TICKET = 9999999


def ticket_number(ticket):
    """Return the trailing numeric token of a ticket string as an int.

    The ticket is split on whitespace and its last token is inspected:
    "A/5 21171" yields 21171 and "113803" yields 113803.  When the last token
    is not made of digits, or the value is empty or not a string (missing
    values arrive as float NaN), NON_NUMERIC_TICKET is returned instead of
    raising.
    """
    if not isinstance(ticket, str):
        return NON_NUMERIC_TICKET
    tokens = ticket.split()
    if tokens and tokens[-1].isdigit():
        return int(tokens[-1])
    return NON_NUMERIC_TICKET


def fill_from_train_mean(train_df, test_df, column):
    """Fill missing values of *column* in both frames with the training mean.

    The mean is computed on ``train_df`` only, so the test data is imputed
    with the same statistic the model was trained with.

    Returns a ``(train_column, test_column)`` tuple of new Series; neither
    input frame is modified.
    """
    train_mean = train_df[column].mean()
    return train_df[column].fillna(train_mean), test_df[column].fillna(train_mean)


if __name__ == '__main__':
    Train_df = pd.read_csv('./data/train.csv')
    Test_df = pd.read_csv('./data/test.csv')
    gender_df = pd.read_csv('./data/gender_submission.csv')

    # Keep the ids of the rows we actually predict on, before the column is
    # dropped, so the submission does not depend on another file's row order.
    passenger_ids = Test_df['PassengerId']

    # Columns that are not used as features; removed from both sets.
    unused_columns = ['PassengerId', 'Cabin', 'Name']
    Train_df = Train_df.drop(columns=unused_columns)
    Test_df = Test_df.drop(columns=unused_columns)

    # Encode the categorical columns as integers.  Each encoder is fitted on
    # the training data only and then re-used for the test data, so a given
    # category always maps to the same code in both sets.  A category that
    # appears only in the test data raises ValueError instead of silently
    # receiving a mismatched code.
    for column in ('Sex', 'Embarked'):
        encoder = LabelEncoder()
        Train_df[column] = encoder.fit_transform(Train_df[column].astype(str))
        Test_df[column] = encoder.transform(Test_df[column].astype(str))

    # Impute missing numeric values with the training mean.
    for column in ('Age', 'Fare'):
        Train_df[column], Test_df[column] = fill_from_train_mean(
            Train_df, Test_df, column
        )

    # Reduce each ticket to its trailing number (same rule for both sets).
    Train_df['Ticket'] = Train_df['Ticket'].map(ticket_number)
    Test_df['Ticket'] = Test_df['Ticket'].map(ticket_number)

    # Split the training data into features and a 1-D label vector
    # (a one-column DataFrame makes sklearn emit DataConversionWarning).
    Y_Train = Train_df['Survived']
    X_Train = Train_df.drop(columns=['Survived'])

    # Hold out part of the labelled data for validation.
    x_train, x_test, y_train, y_test = train_test_split(
        X_Train, Y_Train, random_state=42
    )

    # AdaBoost: hold-out accuracy.  Seeded, like the random forest below, so
    # the reported score is reproducible.
    adc = AdaBoostClassifier(random_state=42)
    adc.fit(x_train, y_train)
    print(accuracy_score(y_test, adc.predict(x_test)))

    # Decision tree: hold-out accuracy.
    tree = DecisionTreeClassifier(random_state=42)
    tree.fit(x_train, y_train)
    print(accuracy_score(y_test, tree.predict(x_test)))

    # Random forest: predict the unlabelled test set.  The feature columns
    # are selected in training order so both frames line up.
    rfc = RandomForestClassifier(n_estimators=25, max_depth=7, random_state=42)
    rfc.fit(x_train, y_train)
    Y_pred = rfc.predict(Test_df[X_Train.columns])
    # NOTE(review): gender_submission.csv looks like Kaggle's example
    # submission rather than ground-truth labels, so this number is agreement
    # with that baseline, not true test accuracy -- confirm.
    print(accuracy_score(gender_df['Survived'], Y_pred))

    # Write the predictions for test.csv in submission format.
    submission = pd.DataFrame({
        "PassengerId": passenger_ids,
        "Survived": Y_pred,
    })
    submission.to_csv('submission.csv', index=False)


 

Logo

魔乐社区(Modelers.cn) 是一个中立、公益的人工智能社区,提供人工智能工具、模型、数据的托管、展示与应用协同服务,为人工智能开发及爱好者搭建开放的学习交流平台。社区通过理事会方式运作,由全产业链共同建设、共同运营、共同享有,推动国产AI生态繁荣发展。

更多推荐