Advanced Python Pandas (Merging, Apply, Groupby, Pivot, Date)

Original Source: https://www.coursera.org/specializations/data-science-python

Merging Dataframes

import pandas as pd
import numpy as np

# Staff roster, indexed by employee name.
_staff_records = {
    'Name': ['Kelly', 'Sally', 'James'],
    'Role': ['Director of HR', 'Course liasion', 'Grader'],
}
staff_df = pd.DataFrame(_staff_records).set_index('Name')
staff_df.head()
Role
Name
Kelly Director of HR
Sally Course liasion
James Grader
# Student roster, indexed by student name.
_student_records = {
    'Name': ['James', 'Mike', 'Sally'],
    'School': ['Business', 'Law', 'Engineering'],
}
student_df = pd.DataFrame(_student_records).set_index('Name')
student_df.head()
School
Name
James Business
Mike Law
Sally Engineering
pd.merge(staff_df, student_df, how='outer', left_index=True, right_index=True)
Role School
Name
James Grader Business
Kelly Director of HR NaN
Mike NaN Law
Sally Course liasion Engineering
pd.merge(staff_df, student_df, how='inner', left_index=True, right_index=True)
Role School
Name
Sally Course liasion Engineering
James Grader Business
pd.merge(staff_df, student_df, how='left', left_index=True, right_index=True)
Role School
Name
Kelly Director of HR NaN
Sally Course liasion Engineering
James Grader Business
pd.merge(staff_df, student_df, how='right', left_index=True, right_index=True)
Role School
Name
James Grader Business
Mike NaN Law
Sally Course liasion Engineering
# Move 'Name' back from the index into an ordinary column, then join on that
# column instead of on the index.
staff_df = staff_df.reset_index()
student_df = student_df.reset_index()
pd.merge(staff_df, student_df, how='left', left_on='Name', right_on='Name')
Name Role School
0 Kelly Director of HR NaN
1 Sally Course liasion Engineering
2 James Grader Business
# Both frames carry a 'Location' column with different meanings (office vs
# home address); merging on 'Name' keeps both, suffixed _x (left) / _y (right).
_staff_records = {
    'Name': ['Kelly', 'Sally', 'James'],
    'Role': ['Director of HR', 'Course liasion', 'Grader'],
    'Location': ['State Street', 'Washington Avenue', 'Washington Avenue'],
}
_student_records = {
    'Name': ['James', 'Mike', 'Sally'],
    'School': ['Business', 'Law', 'Engineering'],
    'Location': ['1024 Billiard Avenue', 'Fraternity House #22', '512 Wilson Crescent'],
}
staff_df = pd.DataFrame(_staff_records)
student_df = pd.DataFrame(_student_records)
pd.merge(staff_df, student_df, how='left', left_on='Name', right_on='Name')
Location_x Name Role Location_y School
0 State Street Kelly Director of HR NaN NaN
1 Washington Avenue Sally Course liasion 512 Wilson Crescent Engineering
2 Washington Avenue James Grader 1024 Billiard Avenue Business
# Multi-key join: a row must match on BOTH first and last name to survive the
# inner merge (James Wilde vs James Hammond are different people).
_staff_records = {
    'First Name': ['Kelly', 'Sally', 'James'],
    'Last Name': ['Desjardins', 'Brooks', 'Wilde'],
    'Role': ['Director of HR', 'Course liasion', 'Grader'],
}
_student_records = {
    'First Name': ['James', 'Mike', 'Sally'],
    'Last Name': ['Hammond', 'Smith', 'Brooks'],
    'School': ['Business', 'Law', 'Engineering'],
}
staff_df = pd.DataFrame(_staff_records)
student_df = pd.DataFrame(_student_records)
pd.merge(staff_df, student_df, how='inner', left_on=['First Name','Last Name'], right_on=['First Name','Last Name'])
First Name Last Name Role School
0 Sally Brooks Course liasion Engineering

‘Apply’

# Load the US census estimates file (must be present in the working directory).
df = pd.read_csv('census.csv')
df.head()
SUMLEV REGION DIVISION STATE COUNTY STNAME CTYNAME CENSUS2010POP ESTIMATESBASE2010 POPESTIMATE2010 ... RDOMESTICMIG2011 RDOMESTICMIG2012 RDOMESTICMIG2013 RDOMESTICMIG2014 RDOMESTICMIG2015 RNETMIG2011 RNETMIG2012 RNETMIG2013 RNETMIG2014 RNETMIG2015
0 40 3 6 1 0 Alabama Alabama 4779736 4780127 4785161 ... 0.002295 -0.193196 0.381066 0.582002 -0.467369 1.030015 0.826644 1.383282 1.724718 0.712594
1 50 3 6 1 1 Alabama Autauga County 54571 54571 54660 ... 7.242091 -2.915927 -3.012349 2.265971 -2.530799 7.606016 -2.626146 -2.722002 2.592270 -2.187333
2 50 3 6 1 3 Alabama Baldwin County 182265 182265 183193 ... 14.832960 17.647293 21.845705 19.243287 17.197872 15.844176 18.559627 22.727626 20.317142 18.293499
3 50 3 6 1 5 Alabama Barbour County 27457 27457 27341 ... -4.728132 -2.500690 -7.056824 -3.904217 -10.543299 -4.874741 -2.758113 -7.167664 -3.978583 -10.543299
4 50 3 6 1 7 Alabama Bibb County 22915 22919 22861 ... -5.527043 -5.068871 -6.201001 -0.177537 0.177258 -5.088389 -4.363636 -5.403729 0.754533 1.107861

5 rows × 100 columns

# Keep county-level rows only (SUMLEV 50); SUMLEV 40 rows are the state
# aggregates, as seen in the preview above.
df = df[df['SUMLEV']==50]
# Hierarchical index: state name on the outer level, county name inside it.
df.set_index(['STNAME','CTYNAME'], inplace=True)
df.head()
SUMLEV REGION DIVISION STATE COUNTY CENSUS2010POP ESTIMATESBASE2010 POPESTIMATE2010 POPESTIMATE2011 POPESTIMATE2012 ... RDOMESTICMIG2011 RDOMESTICMIG2012 RDOMESTICMIG2013 RDOMESTICMIG2014 RDOMESTICMIG2015 RNETMIG2011 RNETMIG2012 RNETMIG2013 RNETMIG2014 RNETMIG2015
STNAME CTYNAME
Alabama Autauga County 50 3 6 1 1 54571 54571 54660 55253 55175 ... 7.242091 -2.915927 -3.012349 2.265971 -2.530799 7.606016 -2.626146 -2.722002 2.592270 -2.187333
Baldwin County 50 3 6 1 3 182265 182265 183193 186659 190396 ... 14.832960 17.647293 21.845705 19.243287 17.197872 15.844176 18.559627 22.727626 20.317142 18.293499
Barbour County 50 3 6 1 5 27457 27457 27341 27226 27159 ... -4.728132 -2.500690 -7.056824 -3.904217 -10.543299 -4.874741 -2.758113 -7.167664 -3.978583 -10.543299
Bibb County 50 3 6 1 7 22915 22919 22861 22733 22642 ... -5.527043 -5.068871 -6.201001 -0.177537 0.177258 -5.088389 -4.363636 -5.403729 0.754533 1.107861
Blount County 50 3 6 1 9 57322 57322 57373 57711 57776 ... 1.807375 -1.177622 -1.748766 -2.062535 -1.369970 1.859511 -0.848580 -1.402476 -1.577232 -0.884411

5 rows × 98 columns

# Find the min and max of the following population-estimate columns for each county (row).

# Columns holding the yearly population estimates to summarise.
# (NB: despite the name, these are column labels, not row labels.)
rows = ['POPESTIMATE2010',
        'POPESTIMATE2011',
        'POPESTIMATE2012',
        'POPESTIMATE2013',
        'POPESTIMATE2014',
        'POPESTIMATE2015']
# Vectorised per-row min/max over the estimate columns.  Equivalent to the
# row-wise df.apply(lambda x: ..., axis=1) idiom but orders of magnitude
# faster on a large frame, since it avoids a Python-level call per row.
result = pd.DataFrame({'min': df[rows].min(axis=1), 'max': df[rows].max(axis=1)})
result.head()
min max
STNAME CTYNAME
Alabama Autauga County 54660.0 55347.0
Baldwin County 183193.0 203709.0
Barbour County 26489.0 27341.0
Bibb County 22512.0 22861.0
Blount County 57373.0 57776.0

Group by

# Reload the census data and again restrict to county-level rows (SUMLEV 50).
df = pd.read_csv('census.csv')
df = df[df['SUMLEV']==50]
df.head()
SUMLEV REGION DIVISION STATE COUNTY STNAME CTYNAME CENSUS2010POP ESTIMATESBASE2010 POPESTIMATE2010 ... RDOMESTICMIG2011 RDOMESTICMIG2012 RDOMESTICMIG2013 RDOMESTICMIG2014 RDOMESTICMIG2015 RNETMIG2011 RNETMIG2012 RNETMIG2013 RNETMIG2014 RNETMIG2015
1 50 3 6 1 1 Alabama Autauga County 54571 54571 54660 ... 7.242091 -2.915927 -3.012349 2.265971 -2.530799 7.606016 -2.626146 -2.722002 2.592270 -2.187333
2 50 3 6 1 3 Alabama Baldwin County 182265 182265 183193 ... 14.832960 17.647293 21.845705 19.243287 17.197872 15.844176 18.559627 22.727626 20.317142 18.293499
3 50 3 6 1 5 Alabama Barbour County 27457 27457 27341 ... -4.728132 -2.500690 -7.056824 -3.904217 -10.543299 -4.874741 -2.758113 -7.167664 -3.978583 -10.543299
4 50 3 6 1 7 Alabama Bibb County 22915 22919 22861 ... -5.527043 -5.068871 -6.201001 -0.177537 0.177258 -5.088389 -4.363636 -5.403729 0.754533 1.107861
5 50 3 6 1 9 Alabama Blount County 57322 57322 57373 ... 1.807375 -1.177622 -1.748766 -2.062535 -1.369970 1.859511 -0.848580 -1.402476 -1.577232 -0.884411

5 rows × 100 columns

# Mean 2010 census population per state.  Use the string alias 'mean' rather
# than np.average: passing raw numpy callables to .agg is deprecated in
# modern pandas, and 'mean' produces the same values.
average_census2010pop = df.groupby('STNAME').agg({'CENSUS2010POP': 'mean'})
average_census2010pop.head()
CENSUS2010POP
STNAME
Alabama 71339.343284
Alaska 24490.724138
Arizona 426134.466667
Arkansas 38878.906667
California 642309.586207
# Per-state average and total of the 2010 census population.  Named
# aggregation (keyword=func) replaces the dict-of-funcs form, which was
# deprecated (see the FutureWarning below) and removed in pandas 1.0.
avg_sum_census2010pop = (df.set_index('STNAME')
                           .groupby(level=0)['CENSUS2010POP']
                           .agg(avg='mean', sum='sum'))
avg_sum_census2010pop.head()
C:\Users\vnfma\Anaconda3\lib\site-packages\ipykernel\__main__.py:1: FutureWarning: using a dict on a Series for aggregation
is deprecated and will be removed in a future version
  if __name__ == '__main__':
avg sum
STNAME
Alabama 71339.343284 4779736
Alaska 24490.724138 710231
Arizona 426134.466667 6392017
Arkansas 38878.906667 2915918
California 642309.586207 37253956
# Average and total of the 2010/2011 estimates per state.  Two fixes for
# modern pandas: select columns with a list (tuple selection was removed),
# and aggregate with a list of functions (the renaming dict was removed).
# NOTE: the result columns are now nested as (column, stat) rather than the
# (stat, column) layout shown in the original output below.
avg_sum_popestimate = (df.set_index('STNAME')
                         .groupby(level=0)[['POPESTIMATE2010', 'POPESTIMATE2011']]
                         .agg(['mean', 'sum']))
avg_sum_popestimate.head()
avg sum
POPESTIMATE2010 POPESTIMATE2011 POPESTIMATE2010 POPESTIMATE2011
STNAME
Alabama 71420.313433 71658.328358 4785161 4801108
Alaska 24621.413793 24921.379310 714021 722720
Arizona 427213.866667 431248.800000 6408208 6468732
Arkansas 38965.253333 39180.506667 2922394 2938538
California 643691.017241 650000.586207 37334079 37700034

Scales

# Letter grades with a quality label as the index.  Naming the column at
# construction time avoids the separate in-place rename step.
df = pd.DataFrame({'Grades': ['A+', 'A', 'A-', 'B+', 'B', 'B-', 'C+', 'C', 'C-', 'D+', 'D']},
                  index=['excellent', 'excellent', 'excellent', 'good', 'good', 'good', 'ok', 'ok', 'ok', 'poor', 'poor'])
df['Grades'].head()
excellent    A+
excellent     A
excellent    A-
good         B+
good          B
Name: Grades, dtype: object
df['Grades'].astype('category').head()
excellent    A+
excellent     A
excellent    A-
good         B+
good          B
Name: Grades, dtype: category
Categories (11, object): [A, A+, A-, B, ..., C+, C-, D, D+]
# Ordered categorical from worst (D) to best (A+).  The categories=/ordered=
# keyword arguments to .astype() were deprecated (see the FutureWarning
# below) and removed; passing a CategoricalDtype is the supported form.
grades = df['Grades'].astype(
    pd.CategoricalDtype(categories=['D', 'D+', 'C-', 'C', 'C+', 'B-', 'B', 'B+', 'A-', 'A', 'A+'],
                        ordered=True))
grades.head()
C:\Users\vnfma\Anaconda3\lib\site-packages\ipykernel\__main__.py:3: FutureWarning: specifying 'categories' or 'ordered' in .astype() is deprecated; pass a CategoricalDtype instead
  app.launch_new_instance()





excellent    A+
excellent     A
excellent    A-
good         B+
good          B
Name: Grades, dtype: category
Categories (11, object): [D < D+ < C- < C ... B+ < A- < A < A+]
grades > 'C'
excellent     True
excellent     True
excellent     True
good          True
good          True
good          True
ok            True
ok           False
ok           False
poor         False
poor         False
Name: Grades, dtype: bool
# Bin the per-state average county population into 10 equal-width intervals.
df = pd.read_csv('census.csv')
df = df[df['SUMLEV']==50]
# Named aggregation replaces the removed dict-renaming .agg form (the
# original triggered the FutureWarning shown below).
df = df.set_index('STNAME').groupby(level=0)['CENSUS2010POP'].agg(avg='mean')
scaled_df = pd.cut(df['avg'], 10)
scaled_df.head()
C:\Users\vnfma\Anaconda3\lib\site-packages\ipykernel\__main__.py:3: FutureWarning: using a dict on a Series for aggregation
is deprecated and will be removed in a future version
  app.launch_new_instance()





STNAME
Alabama         (11706.087, 75333.413]
Alaska          (11706.087, 75333.413]
Arizona       (390320.176, 453317.529]
Arkansas        (11706.087, 75333.413]
California    (579312.234, 642309.586]
Name: avg, dtype: category
Categories (10, interval[float64]): [(11706.087, 75333.413] < (75333.413, 138330.766] < (138330.766, 201328.118] < (201328.118, 264325.471] ... (390320.176, 453317.529] < (453317.529, 516314.881] < (516314.881, 579312.234] < (579312.234, 642309.586]]

Pivot Tables

# Electric-vehicle dataset (must be present in the working directory).
df = pd.read_csv('cars.csv')
df.head()
YEAR Make Model Size (kW) Unnamed: 5 TYPE CITY (kWh/100 km) HWY (kWh/100 km) COMB (kWh/100 km) CITY (Le/100 km) HWY (Le/100 km) COMB (Le/100 km) (g/km) RATING (km) TIME (h)
0 2012 MITSUBISHI i-MiEV SUBCOMPACT 49 A1 B 16.9 21.4 18.7 1.9 2.4 2.1 0 NaN 100 7
1 2012 NISSAN LEAF MID-SIZE 80 A1 B 19.3 23.0 21.1 2.2 2.6 2.4 0 NaN 117 7
2 2013 FORD FOCUS ELECTRIC COMPACT 107 A1 B 19.0 21.1 20.0 2.1 2.4 2.2 0 NaN 122 4
3 2013 MITSUBISHI i-MiEV SUBCOMPACT 49 A1 B 16.9 21.4 18.7 1.9 2.4 2.1 0 NaN 100 7
4 2013 NISSAN LEAF MID-SIZE 80 A1 B 19.3 23.0 21.1 2.2 2.6 2.4 0 NaN 117 7
df.pivot_table(values='(kW)', index='YEAR', columns='Make', aggfunc=np.mean)
Make BMW CHEVROLET FORD KIA MITSUBISHI NISSAN SMART TESLA
YEAR
2012 NaN NaN NaN NaN 49.0 80.0 NaN NaN
2013 NaN NaN 107.0 NaN 49.0 80.0 35.0 280.000000
2014 NaN 104.0 107.0 NaN 49.0 80.0 35.0 268.333333
2015 125.0 104.0 107.0 81.0 49.0 80.0 35.0 320.666667
2016 125.0 104.0 107.0 81.0 49.0 80.0 35.0 409.700000
df.pivot_table(values='(kW)', index='YEAR', columns='Make', aggfunc=[np.mean,np.min], margins=True)
mean amin
Make BMW CHEVROLET FORD KIA MITSUBISHI NISSAN SMART TESLA All BMW CHEVROLET FORD KIA MITSUBISHI NISSAN SMART TESLA All
YEAR
2012 NaN NaN NaN NaN 49.0 80.0 NaN NaN 64.500000 NaN NaN NaN NaN 49.0 80.0 NaN NaN 49
2013 NaN NaN 107.0 NaN 49.0 80.0 35.0 280.000000 158.444444 NaN NaN 107.0 NaN 49.0 80.0 35.0 270.0 35
2014 NaN 104.0 107.0 NaN 49.0 80.0 35.0 268.333333 135.000000 NaN 104.0 107.0 NaN 49.0 80.0 35.0 225.0 35
2015 125.0 104.0 107.0 81.0 49.0 80.0 35.0 320.666667 181.428571 125.0 104.0 107.0 81.0 49.0 80.0 35.0 280.0 35
2016 125.0 104.0 107.0 81.0 49.0 80.0 35.0 409.700000 252.263158 125.0 104.0 107.0 81.0 49.0 80.0 35.0 283.0 35
All 125.0 104.0 107.0 81.0 49.0 80.0 35.0 345.478261 190.622642 125.0 104.0 107.0 81.0 49.0 80.0 35.0 225.0 35

Date Functionality in Pandas

Timestamp

pd.Timestamp('9/1/2016 10:05AM')
Timestamp('2016-09-01 10:05:00')

Period

pd.Period('1/2016')
Period('2016-01', 'M')
pd.Period('3/5/2016')
Period('2016-03-05', 'D')

DatetimeIndex

# A Series keyed by Timestamps automatically gets a DatetimeIndex.
_september_days = [pd.Timestamp('2016-09-01'), pd.Timestamp('2016-09-02'), pd.Timestamp('2016-09-03')]
t1 = pd.Series(['a', 'b', 'c'], _september_days)
t1
2016-09-01    a
2016-09-02    b
2016-09-03    c
dtype: object
type(t1.index)
pandas.core.indexes.datetimes.DatetimeIndex

PeriodIndex

# A Series keyed by Periods automatically gets a PeriodIndex (freq 'M' here).
_months = [pd.Period('2016-09'), pd.Period('2016-10'), pd.Period('2016-11')]
t2 = pd.Series(['d', 'e', 'f'], _months)
t2
2016-09    d
2016-10    e
2016-11    f
Freq: M, dtype: object
type(t2.index)
pandas.core.indexes.period.PeriodIndex

Converting to Datetime

# Four differently-formatted date strings that to_datetime can all parse.
d1 = ['2 June 2013', 'Aug 29, 2014', '2015-06-26', '7/12/16']
# Random two-column integer data (values in [10, 100)), keyed by those strings.
_values = np.random.randint(10, 100, (4, 2))
ts3 = pd.DataFrame(_values, index=d1, columns=['a', 'b'])
ts3
a b
2 June 2013 94 71
Aug 29, 2014 37 11
2015-06-26 76 76
7/12/16 97 33
# to_datetime normalises all four heterogeneous formats into one DatetimeIndex.
ts3.index = pd.to_datetime(ts3.index)
ts3
a b
2013-06-02 94 71
2014-08-29 37 11
2015-06-26 76 76
2016-07-12 97 33
pd.to_datetime('4.7.12', dayfirst=True)
Timestamp('2012-07-04 00:00:00')

Timedeltas

pd.Timestamp('9/3/2016')-pd.Timestamp('9/1/2016')
Timedelta('2 days 00:00:00')
pd.Timestamp('9/2/2016 8:10AM') + pd.Timedelta('12D 3H')
Timestamp('2016-09-14 11:10:00')

Working with Dates in a Dataframe

# Nine dates at a biweekly cadence, anchored on Sundays ('2W-SUN'), starting
# from the first Sunday on or after 2016-10-01.
dates = pd.date_range('10-01-2016', periods=9, freq='2W-SUN')
dates
DatetimeIndex(['2016-10-02', '2016-10-16', '2016-10-30', '2016-11-13',
               '2016-11-27', '2016-12-11', '2016-12-25', '2017-01-08',
               '2017-01-22'],
              dtype='datetime64[ns]', freq='2W-SUN')
# Two random series over the biweekly dates: 'Count 1' is a cumulative walk
# around 100, 'Count 2' is independent noise around 120.
df = pd.DataFrame({'Count 1': 100 + np.random.randint(-5, 10, 9).cumsum(),
                  'Count 2': 120 + np.random.randint(-5, 10, 9)}, index=dates)
df
Count 1 Count 2
2016-10-02 108 126
2016-10-16 112 115
2016-10-30 118 117
2016-11-13 113 128
2016-11-27 110 127
2016-12-11 109 125
2016-12-25 114 125
2017-01-08 112 127
2017-01-22 114 117
df.index.weekday_name
Index(['Sunday', 'Sunday', 'Sunday', 'Sunday', 'Sunday', 'Sunday', 'Sunday',
       'Sunday', 'Sunday'],
      dtype='object')
df.diff()
Count 1 Count 2
2016-10-02 NaN NaN
2016-10-16 4.0 -11.0
2016-10-30 6.0 2.0
2016-11-13 -5.0 11.0
2016-11-27 -3.0 -1.0
2016-12-11 -1.0 -2.0
2016-12-25 5.0 0.0
2017-01-08 -2.0 2.0
2017-01-22 2.0 -10.0
df.resample('M').mean()
Count 1 Count 2
2016-10-31 112.666667 119.333333
2016-11-30 111.500000 127.500000
2016-12-31 111.500000 125.000000
2017-01-31 113.000000 122.000000
df['2017']
Count 1 Count 2
2017-01-08 112 127
2017-01-22 114 117
df['2016-12']
Count 1 Count 2
2016-12-11 109 125
2016-12-25 114 125
df['2016-12':]
Count 1 Count 2
2016-12-11 109 125
2016-12-25 114 125
2017-01-08 112 127
2017-01-22 114 117
df.asfreq('W', method='ffill')
Count 1 Count 2
2016-10-02 108 126
2016-10-09 108 126
2016-10-16 112 115
2016-10-23 112 115
2016-10-30 118 117
2016-11-06 118 117
2016-11-13 113 128
2016-11-20 113 128
2016-11-27 110 127
2016-12-04 110 127
2016-12-11 109 125
2016-12-18 109 125
2016-12-25 114 125
2017-01-01 114 125
2017-01-08 112 127
2017-01-15 112 127
2017-01-22 114 117

Leave a Comment