# Import des librairies
import pandas as pd  # pandas 
import numpy as np
import matplotlib.pyplot as plt # Pour les graphiques
import seaborn as sns # seconde librairie de visualisation
# This is to test results
from test_helper import Test

g1800s = pd.read_csv("g1800s.csv")

# Afficher les premières lignes
g1800s.head()

g1800s.shape
g1800s.size

19897

g1800s.describe()

Nombre_de_lignes = g1800s.shape[0]
Nombre_de_colonnes = g1800s.shape[1]
Type_de_la_variable_1803 = (g1800s["1800"]).dtypes## FILL HERE ## #choisir entre float, string, object, int
Nombre_de_variables_qualitatives = g1800s.value_counts()

Test.assertEqualsHashed(Nombre_de_lignes, '61188f24396807ba7ca38919a158766de935852e')
Test.assertEqualsHashed(Nombre_de_colonnes,'dbc0f004854457f59fb16ab863a3a1722cef553f')
Test.assertEqualsHashed(Type_de_la_variable_1803,'1c737bef1c20a191fa97fbb9558e4f5cc67ac29d')
Test.assertEqualsHashed(Nombre_de_variables_qualitatives, '356a192b7913b04c54574d18c28d46e6395428ab')

1 test passed.
1 test passed.
1 test failed. 
1 test failed.

Nb_donnees_manquantes = g1800s.isna().sum().sum()
Nombre_données_totales =g1800s.size
Pourcentage_donnees_manquantes = Nb_donnees_manquantes/Nombre_données_totales*100

Test.assertEqualsHashed(Nb_donnees_manquantes,'b124524c4b1ade45d1deecbcdef614fadb3ec205')
Test.assertTrue(np.isclose(Pourcentage_donnees_manquantes, 5.5, rtol=0.01))

1 test passed.
1 test passed.

# scatter plot
g1800s.plot(kind='scatter', x='1800', y='1899')

# Labels des axes
plt.xlabel('Life Expectancy by Country in 1800')
plt.ylabel('Life Expectancy by Country in 1899')

# Limites des axes
plt.xlim(20, 55)
plt.ylim(20, 55)

# show
plt.show()

sns.scatterplot(data=____, x='_____', y='_____')

sns.lmplot(data=g1800s, x='1800', y='1899')

<seaborn.axisgrid.FacetGrid at 0x131183f50>

# 1.5.1
(g1800s.dropna().loc[:,'1800'] >= 0).all().all()

True

#2 (solution --> comprendre)
assert g1800s['Life expectancy'].value_counts().max() == 1

g1900s = pd.read_csv("g1900s.csv")
g2000s = pd.read_csv("g2000s.csv")

Test.assertEqualsHashed(g2000s.shape, 'dd7e258e5680b4e7af857c81c4beae957c61f96d')

1 test passed.

g1900s.shape

(197, 101)

g2000s.shape

(197, 24)

g2000s.columns

Index(['Life expectancy', '2000', '2001', '2002', '2003', '2004', '2005',
       '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014',
       '2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022'],
      dtype='object')

# Set a common index for all dataframes
for df in [g1800s, g1900s, g2000s]:
    df.set_index('Life expectancy', inplace=True)

# Concatenate the DataFrames row-wise
gapminder = pd.concat([g1800s, g1900s, g2000s], axis=1, sort=True)
print(gapminder.head()
      )

                  1800   1801   1802   1803   1804   1805   1806   1807  \
Life expectancy                                                           
Afghanistan      28.21  28.20  28.19  28.18  28.17  28.16  28.15  28.14   
Albania          35.40  35.40  35.40  35.40  35.40  35.40  35.40  35.40   
Algeria          28.82  28.82  28.82  28.82  28.82  28.82  28.82  28.82   
Andorra            NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN   
Angola           26.98  26.98  26.98  26.98  26.98  26.98  26.98  26.98   

                  1808   1809  ...   2013   2014   2015   2016   2017   2018  \
Life expectancy                ...                                             
Afghanistan      28.13  28.12  ...  61.93  61.93  61.91  62.03  62.90  62.73   
Albania          35.40  35.40  ...  78.28  78.21  78.14  78.23  78.33  78.44   
Algeria          28.82  28.82  ...  75.12  75.27  75.44  75.71  75.92  76.02   
Andorra            NaN    NaN  ...  81.95  81.97  82.01  82.06  82.11  82.14   
Angola           26.98  26.98  ...  62.13  63.01  63.52  63.87  64.24  64.63   

                  2019   2020   2021   2022  
Life expectancy                              
Afghanistan      63.33  63.39  63.98  64.30  
Albania          78.50  77.90  78.73  78.84  
Algeria          76.20  76.20  76.56  76.74  
Andorra          82.19    NaN    NaN    NaN  
Angola           65.08  65.20  65.78  66.11  

[5 rows x 223 columns]

Test.assertEqualsHashed(gapminder.shape,'fe67fcd9cfc09ade258d010684f9d69f7ff014b0')

1 test passed.

print(gapminder.size, gapminder.head(5))

43931                   1800   1801   1802   1803   1804   1805   1806   1807  \
Life expectancy                                                           
Afghanistan      28.21  28.20  28.19  28.18  28.17  28.16  28.15  28.14   
Albania          35.40  35.40  35.40  35.40  35.40  35.40  35.40  35.40   
Algeria          28.82  28.82  28.82  28.82  28.82  28.82  28.82  28.82   
Andorra            NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN   
Angola           26.98  26.98  26.98  26.98  26.98  26.98  26.98  26.98   

                  1808   1809  ...   2013   2014   2015   2016   2017   2018  \
Life expectancy                ...                                             
Afghanistan      28.13  28.12  ...  61.93  61.93  61.91  62.03  62.90  62.73   
Albania          35.40  35.40  ...  78.28  78.21  78.14  78.23  78.33  78.44   
Algeria          28.82  28.82  ...  75.12  75.27  75.44  75.71  75.92  76.02   
Andorra            NaN    NaN  ...  81.95  81.97  82.01  82.06  82.11  82.14   
Angola           26.98  26.98  ...  62.13  63.01  63.52  63.87  64.24  64.63   

                  2019   2020   2021   2022  
Life expectancy                              
Afghanistan      63.33  63.39  63.98  64.30  
Albania          78.50  77.90  78.73  78.84  
Algeria          76.20  76.20  76.56  76.74  
Andorra          82.19    NaN    NaN    NaN  
Angola           65.08  65.20  65.78  66.11  

[5 rows x 223 columns]

print(gapminder.columns)

Index(['1800', '1801', '1802', '1803', '1804', '1805', '1806', '1807', '1808',
       '1809',
       ...
       '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021',
       '2022'],
      dtype='object', length=223)

# Reset index (index as line numbers + individual column "Life expectancy")
gapminder = gapminder.reset_index()
gapminder.head()

# Entrer les paramètres 
gapminder_melt = pd.melt(frame= gapminder, id_vars="Life expectancy")
gapminder_melt.head()

gapminder_melt.columns = ['country', 'year','life_expectancy']  # Renommer les colonnes,
gapminder_melt.head(10)

Test.assertEquals(gapminder_melt.loc[4, 'life_expectancy'], 26.98)

1 test passed.

gapminder = gapminder_melt

gapminder.dtypes

country             object
year                object
life_expectancy    float64
dtype: object

gapminder.year =gapminder.year.astype(int)

Test.assertEqualsHashed(gapminder.country.dtype, '1615307cc4523f183e777df67f168c86908e8007')
Test.assertEqualsHashed(gapminder.year.dtype, '3cf12f96228a3fa41a25040bdcc6eac3659e7844')

1 test passed.
1 test passed.

# Create the series of countries: countries
countries = pd.Series(gapminder.country)
# Drop all the duplicates from countries
countries = countries.drop_duplicates()

# Write the regular expression: pattern
pattern = '^[A-Za-z\.\s]*$' ## FILL HERE ##

# Create the Boolean vector: mask
mask = countries.str.contains(pattern)

# Invert the mask: mask_inverse
mask_inverse = ~mask

# Subset countries using mask_inverse: invalid_countries
invalid_countries = countries[mask_inverse]

# Print invalid_countries
print(invalid_countries)

41     Cote d'Ivoire
69     Guinea-Bissau
176      Timor-Leste
Name: country, dtype: object

<>:7: SyntaxWarning: invalid escape sequence '\.'
<>:7: SyntaxWarning: invalid escape sequence '\.'
/var/folders/pd/3rcrcz4d0f9cd_p_f3x_ll2r0000gn/T/ipykernel_27805/1270135150.py:7: SyntaxWarning: invalid escape sequence '\.'
  pattern = '^[A-Za-z\.\s]*$' ## FILL HERE ##

gapminder.life_expectancy.plot(kind='hist')

<Axes: ylabel='Frequency'>

sns.displot(gapminder.life_expectancy.dropna(), kind='hist', kde=True)#, element='step')

<seaborn.axisgrid.FacetGrid at 0x1387f9e20>

# Read standard matplotlib colors in colors
prop_cycle = plt.rcParams['axes.prop_cycle']
colors = prop_cycle.by_key()['color']

sns.histplot(gapminder[gapminder['year']<1900].life_expectancy.dropna(), kde=True, stat='density', color=colors[0])
idx = (gapminder['year']>=1900) & (gapminder['year']<2000)
sns.histplot(gapminder[idx].life_expectancy.dropna(),  kde=True, stat='density', color=colors[1])
sns.histplot(gapminder[gapminder['year']>1999].life_expectancy.dropna(),  kde=True, stat='density', color=colors[2])

<Axes: xlabel='life_expectancy', ylabel='Density'>

sns.kdeplot(gapminder[gapminder['year']<1900].life_expectancy.dropna(),  fill=True)
idx = (gapminder['year']>=1900) & (gapminder['year']<2000)
sns.kdeplot(gapminder[idx].life_expectancy.dropna(), fill=True)
sns.kdeplot(gapminder[gapminder['year']>1999].life_expectancy.dropna(), fill=True)

<Axes: xlabel='life_expectancy', ylabel='Density'>

gapminder['siecle'] = pd.cut(gapminder['year'], [1799, 1899, 1999, 2100], labels=['19e', '20e', '21e' ])

# On extrait la liste des couleurs de matplotlib
prop_cycle = plt.rcParams['axes.prop_cycle']
colors = prop_cycle.by_key()['color']

k = 0
groups = gapminder.dropna().groupby('siecle')
for l,group in groups:
    # fill here
    k = k + 1
plt.legend()

<matplotlib.legend.Legend at 0x7efd3cb89be0>

#%matplotlib widget
# Group gapminder: gapminder_agg
gapminder_agg = gapminder.groupby('year')['life_expectancy'].mean()

plt.figure(figsize=(10,6))

# Create a line plot of life expectancy per year
gapminder_agg.plot()

# Add title and specify axis labels
plt.title('Life expectancy over the years')
plt.ylabel('Life expectancy')
plt.xlabel('Year')

# Display the plots
plt.tight_layout()
plt.show()

	1800	1801	1802	1803	1804	1805	1806	1807	1808	1809	...	1890	1891	1892	1893	1894	1895	1896	1897	1898	1899
count	186.000000	186.000000	186.000000	186.000000	186.000000	186.000000	186.000000	186.000000	186.000000	186.000000	...	186.000000	186.000000	186.000000	186.000000	186.000000	186.000000	186.000000	186.000000	186.000000	186.000000
mean	31.504301	31.464194	31.479946	31.386935	31.461398	31.586989	31.644731	31.598441	31.386237	31.314570	...	33.115108	33.347688	33.382688	33.473011	33.738387	33.838925	33.959624	34.064301	34.130108	34.126720
std	3.808172	3.799967	3.930641	3.956290	3.929598	4.004460	4.101342	3.971748	4.079098	4.032086	...	5.887144	5.661333	5.749977	5.946566	5.686913	5.941806	6.228897	6.262405	6.199341	6.186988
min	23.390000	23.390000	23.390000	19.600000	23.390000	23.390000	23.390000	23.390000	12.480000	13.430000	...	4.000000	8.000000	14.000000	8.140000	21.680000	21.490000	19.510000	18.520000	20.670000	19.860000
25%	29.025000	28.962500	28.912500	28.912500	28.962500	29.025000	29.025000	29.025000	28.962500	28.840000	...	30.467500	30.645000	30.422500	30.340000	30.520000	30.317500	30.280000	30.470000	30.305000	30.327500
50%	31.750000	31.650000	31.550000	31.500000	31.550000	31.650000	31.750000	31.750000	31.550000	31.500000	...	32.915000	33.040000	32.985000	33.005000	33.100000	33.175000	33.335000	33.480000	33.545000	33.630000
75%	33.875000	33.895000	33.875000	33.675000	33.775000	33.875000	33.975000	33.975000	33.775000	33.675000	...	35.305000	35.432500	35.520000	35.587500	35.595000	35.720000	35.765000	35.982500	36.135000	36.262500
max	42.850000	40.300000	44.370000	44.840000	42.830000	44.270000	45.820000	43.560000	43.550000	41.740000	...	50.520000	51.130000	52.830000	52.690000	52.150000	54.210000	53.920000	54.200000	54.750000	51.700000

TP DATASCIENCE -- INTRODUCTION

Table of Contents

Lecture des données et première exploration¶

Combinons des tables¶

Tidyfication¶

Regroupement et exploitation¶

	Life expectancy	1800	1801	1802	1803	1804	1805	1806	1807	1808	...	1890	1891	1892	1893	1894	1895	1896	1897	1898	1899
0	Afghanistan	28.21	28.20	28.19	28.18	28.17	28.16	28.15	28.14	28.13	...	31.39	31.59	31.78	31.97	32.16	32.36	32.55	32.74	32.93	33.13
1	Albania	35.40	35.40	35.40	35.40	35.40	35.40	35.40	35.40	35.40	...	35.08	35.06	35.04	35.03	35.01	35.00	34.98	34.96	34.95	34.93
2	Algeria	28.82	28.82	28.82	28.82	28.82	28.82	28.82	28.82	28.82	...	31.05	31.16	31.28	31.39	31.50	31.61	31.72	31.83	31.94	32.06
3	Andorra	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
4	Angola	26.98	26.98	26.98	26.98	26.98	26.98	26.98	26.98	26.98	...	30.76	30.95	31.14	31.33	31.51	31.70	31.89	32.08	32.27	32.46