J'utilise un SVM pour voir si je peux, à partir de données de baseball, classer les coups et estimer le nombre de home runs. J'obtiens des résultats différents d'une exécution du modèle à l'autre. J'ai donc créé une simulation qui exécute le modèle 100 fois, mais je ne comprends pas d'où vient cette variation. Quelqu'un peut-il m'en expliquer la cause ? J'ai pourtant défini random_state=42.
import pandas as pd
from mlxtend.plotting import plot_decision_regions
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
from sklearn import metrics
import statistics
import numpy as np
# Home-run simulation.
#
# An RBF SVM is trained on the rows of one stadium ('Orioles' home games) and
# then used to classify each listed batter's rows from the test file.  The
# number of rows predicted as class 1 (presumably "home run" -- confirm
# against the label column) is recorded for N_RUNS different train/test
# splits, and summary statistics per batter are appended to a CSV file.
#
# Why repeated runs differ: SVC's ``random_state`` only drives the shuffling
# used for probability estimates and is ignored when ``probability=False``,
# so it does not pin anything here.  The variation comes from
# ``train_test_split``, which draws a new random 70/30 split on every call
# unless it receives its own ``random_state``.  Below, each run is seeded with
# its run number: the runs still use N_RUNS *different* splits (so the spread
# is still measured), but the whole experiment is reproducible.
N_RUNS = 100  # the original ``while j <= 100`` executed 101 runs

result_array = []
players = [488768, 517369, 461314, 477165, 506560, 572114, 641319, 592669, 622534, 605486, 602922, 518466, 572362, 519082, 623182, 595978, 543272]
dfSave = pd.DataFrame(columns=['Mean','Max','Min','Std', 'Accuracy', 'Precision', 'f1_score', 'Recall_Score', 'First_Name', 'Last_Name'])

# The input files and the training set depend on neither the batter nor the
# run, so they are loaded and sliced exactly once.
df = pd.read_csv('D:baseballData_2016_use.csv')
df2 = pd.read_csv('D:padres_2016_home.csv')  # Team to test
dataFilter = df.loc[df['Home_Team'] == 'Orioles']  # Stadium to train model to.
X = dataFilter.iloc[:, [4, 5]]  # feature columns (by position)
# Plain ``int`` instead of the abstract ``np.integer``, whose use as a dtype
# is deprecated by NumPy.
y = dataFilter.iloc[:, 3].astype(int)  # label column (by position)

# Because the splits are now seeded by run number, the fitted models are the
# same for every batter; fit them once and reuse them for all batters.
models = []
for run in range(N_RUNS):
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.30, random_state=run)
    # Explicit hyper-parameters; C, gamma, shrinking and
    # decision_function_shape differ from the library defaults.
    svclassifier = SVC(C=4, cache_size=200, class_weight=None, coef0=0.0,
                       decision_function_shape='ovo', degree=3, gamma=0.001,
                       kernel='rbf', max_iter=-1, probability=False,
                       random_state=42, shrinking=False, tol=0.001,
                       verbose=False)
    svclassifier.fit(X_train, y_train)
    models.append(svclassifier)

# As in the original script, the quality metrics describe the *final* run's
# held-out split only (they are not averaged over the runs).
y_pred = svclassifier.predict(X_test)
quality = {
    'Accuracy': accuracy_score(y_test, y_pred),
    'Precision': precision_score(y_test, y_pred, average="macro"),
    'f1_score': f1_score(y_test, y_pred, average="macro"),
    'Recall_Score': recall_score(y_test, y_pred, average="macro"),
}

for batter in players:
    dataFilter2 = df2.loc[df2['Batter_ID'] == batter]  # Players to test in stadium
    if dataFilter2.empty:
        # No rows to classify and no name to report; predicting on an empty
        # frame would raise, so skip this batter explicitly.
        print('No rows found for batter', batter, '- skipping')
        continue
    predict = dataFilter2.iloc[:, [4, 5]]

    for model in models:
        predicted = model.predict(predict)
        home_runs = int(np.count_nonzero(predicted == 1))  # predicted class-1 rows
        result_array.append(home_runs)
        print(home_runs)

    # Keys are inserted in the same order as the original CSV columns.
    row = {
        'Mean': statistics.mean(result_array),
        'Max': max(result_array),
        'Min': min(result_array),
        'Std': statistics.stdev(result_array),
    }
    row.update(quality)
    # Positional access: every row belongs to the same batter, and this also
    # works when the batter's only row carries index label 0 (the original
    # ``.loc[1:, ...]`` slice dropped that row).
    row['First_Name'] = dataFilter2['Batter_First_Name'].iloc[0]
    row['Last_Name'] = dataFilter2['Batter_Last_Name'].iloc[0]

    dfSave = pd.DataFrame([row])
    # Appends one row (with its own header line) per batter, as before.
    dfSave.to_csv('D:test9999.csv', mode='a')
    result_array = []  # start the next batter with an empty sample