Competição Kaggle para prever e classificar sobreviventes do naufrágio do Titanic.
Link da competição com os dados de treino: https://www.kaggle.com/c/titanic
BIBLIOTECAS
In [1]:
import re

import numpy as np
import pandas as pd
# Visualização dos dados
import seaborn as sns
%matplotlib inline
%pylab inline
from matplotlib import pyplot as plt
from matplotlib import style
IMPORTAR DADOS DE TESTE E TREINO
In [2]:
# Load the train / test splits. The original assigned the training frame to
# `dadostrain`, but every later cell (starting with the very next line) reads
# `train` — that NameError is fixed here.
train = pd.read_csv('../input/titanic/train.csv')
teste = pd.read_csv('../input/titanic/test.csv')
# Quick look at which columns contain any missing values.
train.isnull().any()
Out [2]:
PassengerId False
Survived False
Pclass False
Name False
Sex False
Age True
SibSp False
Parch False
Ticket False
Fare False
Cabin True
Embarked True
dtype: bool
AJUSTE DE VARIÁVEIS
survival: Sobrevivente
PassengerId: ID único de um passageiro
pclass: Classe do bilhete
sex: Sexo
Age: Idade em anos
sibsp: Número de irmãos / cônjuges a bordo do Titanic
parch: Número de pais / filhos a bordo do Titanic
ticket: Número do bilhete
fare: Tarifa do passageiro
cabin: Número da cabine
embarked: Porto de embarque
In [3]:
# Fill missing values and encode the categorical columns as small integers.
# Plain assignment replaces `fillna(..., inplace=True)` on a column Series,
# which is the chained-assignment pattern (warned about by pandas 2.x and
# a no-op under copy-on-write in pandas 3).
train['Embarked'] = train['Embarked'].fillna('S')  # 'S' as the fallback port
embark = {'S': 0, 'C': 1, 'Q': 2}
train['Embarked'] = train['Embarked'].map(embark)
gender = {'female': 0, 'male': 1}
train['Sex'] = train['Sex'].map(gender)
train['Age'] = train['Age'].fillna(train['Age'].mean())
train['Fare'] = train['Fare'].fillna(train['Fare'].mean())
# NOTE(review): the test set is imputed with its own means rather than the
# training-set means — confirm this is intentional.
teste['Embarked'] = teste['Embarked'].fillna('S')
teste['Embarked'] = teste['Embarked'].map(embark)
teste['Sex'] = teste['Sex'].map(gender)
teste['Age'] = teste['Age'].fillna(teste['Age'].mean())
teste['Fare'] = teste['Fare'].fillna(teste['Fare'].mean())
train.head()
NOVAS VARIÁVEIS
In [4]:
# Extract the honorific ("Mr", "Miss", ...) from each passenger name.
# The original wrapped this in `for name in train['Name']:`, re-running the
# whole-column extraction once per row, and had two statements fused onto one
# line (a syntax error); both are fixed here.
train['title'] = train['Name'].str.extract(r"([A-Za-z]+)\.", expand=True)
# Titles ordered by frequency, so frequent titles get small codes.
titulo = train['title'].value_counts().index

def verifica_titulo(value):
    """Return the frequency-rank index of the first title found in `value`.

    Returns None (-> NaN after `.map`) when no known title matches.
    """
    for i in range(len(titulo)):
        if re.search(titulo[i], value) is not None:
            return i

train['titulo'] = train['Name'].map(verifica_titulo)
teste['titulo'] = teste['Name'].map(verifica_titulo)
pylab.hist(train['titulo'])
In [5]:
def verifica_cabine(value):
    """Map a cabin value to the index of the first deck letter (A-G) it contains.

    `value` is stringified first, so NaN cabins become 'nan' and match no
    uppercase deck letter; in that case None is returned (-> NaN after `.map`).
    """
    decks = ['A', 'B', 'C', 'D', 'E', 'F', 'G']
    texto = str(value)
    for indice, letra in enumerate(decks):
        if re.search(letra, texto) is not None:
            return indice
# Derive the numeric deck feature; passengers with no recognizable cabin
# letter come back as NaN (verifica_cabine returns None for them).
train['cabine'] = train['Cabin'].map(verifica_cabine)
teste['cabine'] = teste['Cabin'].map(verifica_cabine)
pylab.hist(train['cabine'])
In [6]:
# Collect the alphabetic ticket prefixes (the part before any ' ', '/' or '.')
# and rank them by frequency.
ticket = []
for valor in train['Ticket']:
    prefixo = valor.split(' ')[0].split('/')[0].split('.')[0]
    if prefixo.isalpha():
        ticket.append(prefixo)
letra_ticket = pd.Series(ticket).value_counts().index

def verifica_ticket(value):
    """Return the frequency-rank index of the first known prefix found in
    `value`, or None when no prefix matches (purely numeric tickets)."""
    for indice in range(len(letra_ticket)):
        if re.search(letra_ticket[indice], value) is not None:
            return indice

train['passagem'] = train['Ticket'].map(verifica_ticket)
teste['passagem'] = teste['Ticket'].map(verifica_ticket)
pylab.hist(train['passagem'])
In [7]:
# Family size = siblings/spouses + parents/children + the passenger, and a
# binary flag for passengers travelling alone (familia == 1).
train['familia'] = train['SibSp'] + train['Parch'] + 1
teste['familia'] = teste['SibSp'] + teste['Parch'] + 1
# Vectorized equivalent of the original `apply(lambda x: 1 if x == 1 else 0)`.
train['sozinho'] = (train['familia'] == 1).astype(int)
teste['sozinho'] = (teste['familia'] == 1).astype(int)
pylab.hist(train['sozinho'])
print(train.describe())
Out [7]:
PassengerId Survived Pclass Sex Age \
count 891.000000 891.000000 891.000000 891.000000 891.000000
mean 446.000000 0.383838 2.308642 0.647587 29.699118
std 257.353842 0.486592 0.836071 0.477990 13.002015
min 1.000000 0.000000 1.000000 0.000000 0.420000
25% 223.500000 0.000000 2.000000 0.000000 22.000000
50% 446.000000 0.000000 3.000000 1.000000 29.699118
75% 668.500000 1.000000 3.000000 1.000000 35.000000
max 891.000000 1.000000 3.000000 1.000000 80.000000
SibSp Parch Fare Embarked titulo cabine \
count 891.000000 891.000000 891.000000 891.000000 891.000000 203.000000
mean 0.523008 0.381594 32.204208 0.361392 0.518519 2.364532
std 1.102743 0.806057 49.693429 0.635673 1.355987 1.436829
min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 0.000000 0.000000 7.910400 0.000000 0.000000 1.000000
50% 0.000000 0.000000 14.454200 0.000000 0.000000 2.000000
75% 1.000000 0.000000 31.000000 1.000000 1.000000 3.000000
max 8.000000 6.000000 512.329200 2.000000 15.000000 6.000000
passagem familia sozinho
count 230.000000 891.000000 891.000000
mean 1.865217 1.904602 0.602694
std 2.372350 1.613459 0.489615
min 0.000000 1.000000 0.000000
25% 0.000000 1.000000 0.000000
50% 1.000000 1.000000 1.000000
75% 2.000000 2.000000 1.000000
max 11.000000 11.000000 1.000000
MÉTODOS
In [8]:
from sklearn.model_selection import RepeatedKFold
from sklearn.pipeline import Pipeline
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
import xgboost as xgb
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import metrics
from sklearn.model_selection import KFold, cross_val_score
from sklearn.naive_bayes import GaussianNB

variaveis = ['Pclass', 'Sex', 'Age', 'Embarked', 'titulo', 'passagem', 'familia', 'sozinho']
X = train[variaveis].fillna(-1)  # -1 acts as the "missing" sentinel
y = train['Survived']
print(X.columns)

# Compare several classifiers with repeated 2-fold cross-validation and
# report the mean accuracy of each one.
classifiers = ['LinSVM',
               'RadSVM',
               'LogisticReg',
               'RForestClass',
               'ABoostClass',
               'XGBoostClass',
               'KNClass',
               'GBoostingClass',
               'GaussianNB']
model = [svm.SVC(kernel='linear'),
         svm.SVC(kernel='rbf'),
         LogisticRegression(max_iter=1000),
         # max_features='sqrt' is exactly what 'auto' meant for classifiers;
         # the 'auto' alias was removed in scikit-learn 1.3.
         RandomForestClassifier(min_samples_leaf=3, min_samples_split=3,
                                n_estimators=200, max_features='sqrt',
                                oob_score=True, random_state=0, n_jobs=-1),
         AdaBoostClassifier(random_state=0),
         xgb.XGBClassifier(n_estimators=100),
         KNeighborsClassifier(),
         GradientBoostingClassifier(random_state=0),
         GaussianNB()]

methods = pd.DataFrame()
kf = RepeatedKFold(n_splits=2, n_repeats=10, random_state=10)
# zip keeps name and estimator paired — replaces the manual `i` counter.
for nome, modelo in zip(classifiers, model):
    resultados = []
    for linhas_treino, linhas_valid in kf.split(X):
        X_treino, X_valid = X.iloc[linhas_treino], X.iloc[linhas_valid]
        y_treino, y_valid = y.iloc[linhas_treino], y.iloc[linhas_valid]
        modelo.fit(X_treino, y_treino)
        p = modelo.predict(X_valid)
        resultados.append(np.mean(y_valid == p))
    methods[nome] = resultados
print(methods.mean().sort_values(ascending=False))
Out [8]:
Index(['Pclass', 'Sex', 'Age', 'Embarked', 'titulo', 'passagem', 'familia', 'sozinho'], dtype='object')
RForestClass 0.814253
GBoostingClass 0.809879
ABoostClass 0.802348
XGBoostClass 0.800787
LogisticReg 0.799543
LinSVM 0.789330
GaussianNB 0.788652
KNClass 0.725591
RadSVM 0.631193
dtype: float64
RandomForestClassifier (melhor precisão)
In [9]:
# Keep only the best model from the comparison above (RandomForest).
# The original commented out list entries INCLUDING the closing brackets
# (`#'GaussianNB']`, `#GaussianNB()]`), leaving both lists unclosed — a
# syntax error on a fresh run. Rewritten with just the surviving entries.
classifiers = ['RForestClass']
model = [RandomForestClassifier(min_samples_leaf=3, min_samples_split=3,
                                n_estimators=200, max_features='sqrt',
                                oob_score=True, random_state=0, n_jobs=-1)]

from sklearn.model_selection import train_test_split

classification = pd.DataFrame()
resultados = []
for nome, modelo in zip(classifiers, model):
    # random_state makes the hold-out split (and the accuracy) reproducible.
    X_treino, X_valid, y_treino, y_valid = train_test_split(
        X, y, test_size=0.3, random_state=0)
    modelo.fit(X_treino, y_treino)
    p = modelo.predict(X_valid)
    acc = np.mean(y_valid == p)
    classification[nome] = p
    resultados.append(acc)
# np.mean: the bare `mean` of the original only existed via `%pylab`.
print('accuracy:', np.mean(resultados))
print(classification.shape)
Out [9]:
accuracy: 0.8246268656716418
(268, 1)
TESTE DE ERROS
In [10]:
# Majority vote across the model columns (with a single model this simply
# binarises its predictions), then compare against the hold-out labels.
P = pd.DataFrame()
P['S'] = classification.sum(axis=1) / classification.shape[1]
# Threshold at the mean vote share, as in the original `contar_maioria`
# helper: 1 when the share is strictly above the mean, else 0.
peso = P['S'].mean()
P['S'] = (P['S'] > peso).astype(int)
P['True'] = y_valid.reset_index(drop=True)
pylab.hist(P['S'], alpha=0.4)
pylab.hist(P['True'], alpha=0.4)
print('acertos:', np.mean(P['True'] == P['S']))
print('de:', P.shape[0])
Out [10]:
acertos: 0.8246268656716418
de: 268
TESTE
In [11]:
# Retrain each kept model on the full training data and predict the Kaggle
# test set; one column of predictions per model.
output = pd.DataFrame()
for nome, modelo in zip(classifiers, model):
    modelo.fit(X, y)
    p = modelo.predict(teste[variaveis].fillna(-1))
    output[nome] = p
    print(len(p))
print(output.describe())
Out [11]:
418
RForestClass
count 418.000000
mean 0.368421
std 0.482954
min 0.000000
25% 0.000000
50% 0.000000
75% 1.000000
max 1.000000
コメント