-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
21 changed files
with
5,743 additions
and
5,625 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,107 @@ | ||
# Libraries used for data analysis | ||
import pandas as pd | ||
import numpy as np | ||
import matplotlib | ||
import seaborn as sns | ||
import matplotlib.pyplot as plt | ||
from numpy.random.mtrand import seed | ||
|
||
from sklearn.ensemble import RandomForestRegressor | ||
from sklearn.linear_model import LogisticRegression | ||
from sklearn.ensemble import GradientBoostingRegressor | ||
from sklearn.neighbors import KNeighborsClassifier | ||
from sklearn.naive_bayes import GaussianNB | ||
from sklearn.metrics import mean_squared_error, r2_score | ||
from sklearn import tree | ||
|
||
# Plot style configuration: seaborn theme first, then matplotlib overrides.
sns.set(
    context='notebook',
    style='darkgrid',
    palette='colorblind',
    font='sans-serif',
    font_scale=1,
    rc=None,
)
# Applied in the same order as before: figure size, font size, font family.
matplotlib.rcParams.update({
    'figure.figsize': [8, 8],
    'font.size': 15,
    'font.family': 'sans-serif',
})
|
||
# --- Data preparation --- | ||
# Loading the dataset | ||
|
||
def data_split(data, ratio, seed=42):
    """Shuffle the rows of ``data`` and split them into train and test sets.

    Args:
        data: Object supporting ``len()`` and positional ``.iloc`` indexing
            (used with a pandas DataFrame in this file).
        ratio: Fraction of the rows assigned to the test set. The test-set
            size is ``int(len(data) * ratio)``, i.e. truncated toward zero.
        seed: Seed for the shuffle. Defaults to 42, the value that used to
            be hard-coded, so existing two-argument calls produce the same
            split as before.

    Returns:
        A ``(train, test)`` tuple of row selections taken from ``data``.
    """
    # NOTE: this reseeds NumPy's *global* RNG on every call, exactly like the
    # original implementation; code that draws from np.random afterwards
    # continues from this seeded state.
    np.random.seed(seed)
    shuffled = np.random.permutation(len(data))
    test_set_size = int(len(data) * ratio)
    test_indices = shuffled[:test_set_size]
    train_indices = shuffled[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]
|
||
if __name__ == "__main__":
    # Feature columns used as model inputs. The trailing spaces in 'Fatigue '
    # and 'Gastrointestinal ' are deliberate: they match the CSV header as is.
    feature_columns = [
        'Breathing Problem', 'Fever', 'Dry Cough', 'Sore throat',
        'Running Nose', 'Asthma', 'Chronic Lung Disease', 'Headache',
        'Heart Disease', 'Diabetes', 'Hyper Tension', 'Fatigue ',
        'Gastrointestinal ', 'Abroad travel', 'Contact with COVID Patient',
        'Attended Large Gathering', 'Visited Public Exposed Places',
        'Family working in Public Exposed Places', 'Wearing Masks',
        'Sanitization from Market',
    ]
    target_column = 'COVID-19'

    # Read the data and hold out 20% of the rows as the test set.
    covid = pd.read_csv('data/data.csv')
    train, test = data_split(covid, 0.2)

    X_train = train[feature_columns].to_numpy()
    X_test = test[feature_columns].to_numpy()

    # Selecting the target as a Series yields a 1-D array directly, so the
    # row counts (previously hard-coded as 4348 / 1086) no longer have to
    # match the dataset size and split ratio.
    Y_train = train[target_column].to_numpy()
    Y_test = test[target_column].to_numpy()

    # DataFrame.info() prints the index dtype, columns, non-null counts and
    # memory usage; write that summary to a text file.
    with open('data/data_preparation/info.txt', 'w') as f:
        covid.info(buf=f)

    # Missing-data check: absolute count and percentage per column.
    missing_values = covid.isnull().sum()
    percent_missing = missing_values / covid.shape[0] * 100
    frame = pd.DataFrame({
        'missing_values ': missing_values,
        'percent_missing %': percent_missing,
    })
    # Saved as CSV so the report is easy to read.
    frame.to_csv('data/data_preparation/missing_value.csv')

    # Descriptive statistics, also saved as CSV for readability.
    covid.describe().to_csv("data/data_preparation/dataset_statics.csv")
|
||
# --- Data visualization --- | ||
# COVID-19 | ||
# sns_plot = sns.countplot(x='COVID-19', data=covid) | ||
# figure = sns_plot.get_figure() | ||
# figure.savefig('data/data_preparation/data_viz/COVID-19.png', dpi = 400) | ||
|
||
# Breathing Problem | ||
# sns_breathing = sns.countplot(x='Breathing Problem',hue='COVID-19',data=covid) | ||
# figure1 = sns_breathing.get_figure() | ||
# figure1.savefig('data/data_preparation/data_viz/BreathingProblem.png', dpi = 400) | ||
|
||
# Fever | ||
# sns_fever = sns.countplot(x='Fever', hue='COVID-19', data=covid) | ||
# figure2 = sns_fever.get_figure() | ||
# figure2.savefig('data/data_preparation/data_viz/Fever.png', dpi = 400) | ||
|
||
# Dry Cough | ||
# sns_dry = sns.countplot(x='Dry Cough',hue='COVID-19',data=covid) | ||
# figure3 = sns_dry.get_figure() | ||
# figure3.savefig('data/data_preparation/data_viz/dry.png', dpi = 400) | ||
|
||
# Sore Throat | ||
# sns_sore = sns.countplot(x='Sore throat',hue='COVID-19',data=covid) | ||
# figure4 = sns_sore.get_figure() | ||
# figure4.savefig('data/data_preparation/data_viz/sore.png', dpi = 400) | ||
|
||
|
||
|
||
|
||
|
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
18 changes: 9 additions & 9 deletions
18
dataset_statics.csv → data/data_preparation/dataset_statics.csv
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,9 +1,9 @@ | ||
,Breathing Problem,Fever,Dry Cough,Sore throat,Running Nose,Asthma,Chronic Lung Disease,Headache,Heart Disease,Diabetes,Hyper Tension,Fatigue ,Gastrointestinal ,Abroad travel,Contact with COVID Patient,Attended Large Gathering,Visited Public Exposed Places,Family working in Public Exposed Places,Wearing Masks,Sanitization from Market,COVID-19 | ||
count,5434.0,5434.0,5434.0,5434.0,5434.0,5434.0,5434.0,5434.0,5434.0,5434.0,5434.0,5434.0,5434.0,5434.0,5434.0,5434.0,5434.0,5434.0,5434.0,5434.0,5434.0 | ||
mean,0.6661759293338241,0.7863452337136547,0.7926021347073978,0.7274567537725433,0.5432462274567538,0.4626426205373574,0.47202797202797203,0.5034965034965035,0.4642988590357011,0.4762605815237394,0.49006256900993744,0.5191387559808612,0.4694516010305484,0.45104895104895104,0.5016562384983437,0.4619065145380935,0.5189547294810453,0.41626794258373206,0.0,0.0,0.8065881486934119 | ||
std,0.4716211327739574,0.4099235665965471,0.40548026751388566,0.44530878904756294,0.49817209342489355,0.4986483574853146,0.4992628934027898,0.5000337861645077,0.4987696953792399,0.4994820831364206,0.4999472415983911,0.49967955300634165,0.4991118498366407,0.49764381725600054,0.5000432695898289,0.4985926537423099,0.4996865689546946,0.49298444924353374,0.0,0.0,0.39500939378839356 | ||
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 | ||
25%,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0 | ||
50%,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0 | ||
75%,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0 | ||
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0 | ||
,Breathing Problem,Fever,Dry Cough,Sore throat,Running Nose,Asthma,Chronic Lung Disease,Headache,Heart Disease,Diabetes,Hyper Tension,Fatigue ,Gastrointestinal ,Abroad travel,Contact with COVID Patient,Attended Large Gathering,Visited Public Exposed Places,Family working in Public Exposed Places,Wearing Masks,Sanitization from Market,COVID-19 | ||
count,5434.0,5434.0,5434.0,5434.0,5434.0,5434.0,5434.0,5434.0,5434.0,5434.0,5434.0,5434.0,5434.0,5434.0,5434.0,5434.0,5434.0,5434.0,5434.0,5434.0,5434.0 | ||
mean,0.6661759293338241,0.7863452337136547,0.7926021347073978,0.7274567537725433,0.5432462274567538,0.4626426205373574,0.47202797202797203,0.5034965034965035,0.4642988590357011,0.4762605815237394,0.49006256900993744,0.5191387559808612,0.4694516010305484,0.45104895104895104,0.5016562384983437,0.4619065145380935,0.5189547294810453,0.41626794258373206,0.0,0.0,0.8065881486934119 | ||
std,0.4716211327739574,0.4099235665965471,0.40548026751388566,0.44530878904756294,0.49817209342489355,0.4986483574853146,0.4992628934027898,0.5000337861645077,0.4987696953792399,0.4994820831364206,0.4999472415983911,0.49967955300634165,0.4991118498366407,0.49764381725600054,0.5000432695898289,0.4985926537423099,0.4996865689546946,0.49298444924353374,0.0,0.0,0.39500939378839356 | ||
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 | ||
25%,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0 | ||
50%,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0 | ||
75%,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0 | ||
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
<class 'pandas.core.frame.DataFrame'> | ||
RangeIndex: 5434 entries, 0 to 5433 | ||
Data columns (total 21 columns): | ||
# Column Non-Null Count Dtype | ||
--- ------ -------------- ----- | ||
0 Breathing Problem 5434 non-null int64 | ||
1 Fever 5434 non-null int64 | ||
2 Dry Cough 5434 non-null int64 | ||
3 Sore throat 5434 non-null int64 | ||
4 Running Nose 5434 non-null int64 | ||
5 Asthma 5434 non-null int64 | ||
6 Chronic Lung Disease 5434 non-null int64 | ||
7 Headache 5434 non-null int64 | ||
8 Heart Disease 5434 non-null int64 | ||
9 Diabetes 5434 non-null int64 | ||
10 Hyper Tension 5434 non-null int64 | ||
11 Fatigue 5434 non-null int64 | ||
12 Gastrointestinal 5434 non-null int64 | ||
13 Abroad travel 5434 non-null int64 | ||
14 Contact with COVID Patient 5434 non-null int64 | ||
15 Attended Large Gathering 5434 non-null int64 | ||
16 Visited Public Exposed Places 5434 non-null int64 | ||
17 Family working in Public Exposed Places 5434 non-null int64 | ||
18 Wearing Masks 5434 non-null int64 | ||
19 Sanitization from Market 5434 non-null int64 | ||
20 COVID-19 5434 non-null int64 | ||
dtypes: int64(21) | ||
memory usage: 891.6 KB |
44 changes: 22 additions & 22 deletions
44
Missingvalue.csv → data/data_preparation/missing_value.csv
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,22 +1,22 @@ | ||
,missing_values ,percent_missing % | ||
Breathing Problem,0,0.0 | ||
Fever,0,0.0 | ||
Dry Cough,0,0.0 | ||
Sore throat,0,0.0 | ||
Running Nose,0,0.0 | ||
Asthma,0,0.0 | ||
Chronic Lung Disease,0,0.0 | ||
Headache,0,0.0 | ||
Heart Disease,0,0.0 | ||
Diabetes,0,0.0 | ||
Hyper Tension,0,0.0 | ||
Fatigue ,0,0.0 | ||
Gastrointestinal ,0,0.0 | ||
Abroad travel,0,0.0 | ||
Contact with COVID Patient,0,0.0 | ||
Attended Large Gathering,0,0.0 | ||
Visited Public Exposed Places,0,0.0 | ||
Family working in Public Exposed Places,0,0.0 | ||
Wearing Masks,0,0.0 | ||
Sanitization from Market,0,0.0 | ||
COVID-19,0,0.0 | ||
,missing_values ,percent_missing % | ||
Breathing Problem,0,0.0 | ||
Fever,0,0.0 | ||
Dry Cough,0,0.0 | ||
Sore throat,0,0.0 | ||
Running Nose,0,0.0 | ||
Asthma,0,0.0 | ||
Chronic Lung Disease,0,0.0 | ||
Headache,0,0.0 | ||
Heart Disease,0,0.0 | ||
Diabetes,0,0.0 | ||
Hyper Tension,0,0.0 | ||
Fatigue ,0,0.0 | ||
Gastrointestinal ,0,0.0 | ||
Abroad travel,0,0.0 | ||
Contact with COVID Patient,0,0.0 | ||
Attended Large Gathering,0,0.0 | ||
Visited Public Exposed Places,0,0.0 | ||
Family working in Public Exposed Places,0,0.0 | ||
Wearing Masks,0,0.0 | ||
Sanitization from Market,0,0.0 | ||
COVID-19,0,0.0 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
7ac8a22
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Model score
Training variance explained: 96.9%
Test variance explained: 97.2%
Data visual
7ac8a22
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
7ac8a22
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Model score
Training variance explained: 96.9%
Test variance explained: 97.2%
Data visual
7ac8a22
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.