Programming for Data Science¶

Pandas - Merge And Join¶

Dr. Bhargavi R

SCOPE, VIT Chennai

In [1]:
import numpy as np
import pandas as pd
In [2]:
df3 = pd.DataFrame({
    'Place': ['AMR', 'BHP', 'CHN', 'VELLORE'],
    'State' : ['AP', 'MP', 'TN','TN']
})
df3
Out[2]:
Place State
0 AMR AP
1 BHP MP
2 CHN TN
3 VELLORE TN
In [3]:
df4 = pd.DataFrame({
    'Place': ['AMR', 'BHP', 'CHN', 'VELLORE'],
    'Students' : [3000, 4000, 10000, 50000]
})
df4
Out[3]:
Place Students
0 AMR 3000
1 BHP 4000
2 CHN 10000
3 VELLORE 50000
In [4]:
df_merge1 = pd.merge(df3, df4)
df_merge1
Out[4]:
Place State Students
0 AMR AP 3000
1 BHP MP 4000
2 CHN TN 10000
3 VELLORE TN 50000
In [5]:
df5 = pd.DataFrame({
    'State': ['AP', 'MP', 'TN'],
    'NumberOfCampuses' : [1, 1, 2]
})
df5
Out[5]:
State NumberOfCampuses
0 AP 1
1 MP 1
2 TN 2
In [6]:
df_merge2 = pd.merge(df3, df5)
df_merge2
Out[6]:
Place State NumberOfCampuses
0 AMR AP 1
1 BHP MP 1
2 CHN TN 2
3 VELLORE TN 2
In [7]:
df6 = pd.DataFrame({
    'employee': ['Bob', 'Jake', 'Lisa', 'Sue'],
    'group': ['Accounting', 'Engineering', 'Engineering', 'HR']
})
df6
Out[7]:
employee group
0 Bob Accounting
1 Jake Engineering
2 Lisa Engineering
3 Sue HR
In [8]:
df7 = pd.DataFrame({'group': ['Accounting', 'Accounting',
                              'Engineering', 'Engineering', 
                              'HR', 'HR'],
                    'skills': ['math', 'spreadsheets', 
                               'coding', 'linux',
                               'spreadsheets', 'organization']})
df7
Out[8]:
group skills
0 Accounting math
1 Accounting spreadsheets
2 Engineering coding
3 Engineering linux
4 HR spreadsheets
5 HR organization
In [9]:
df_merge3 = pd.merge(df6, df7)
df_merge3
Out[9]:
employee group skills
0 Bob Accounting math
1 Bob Accounting spreadsheets
2 Jake Engineering coding
3 Jake Engineering linux
4 Lisa Engineering coding
5 Lisa Engineering linux
6 Sue HR spreadsheets
7 Sue HR organization
In [10]:
df_concat4 = pd.concat([df3, df4], axis = 1)
df_concat4
Out[10]:
Place State Place Students
0 AMR AP AMR 3000
1 BHP MP BHP 4000
2 CHN TN CHN 10000
3 VELLORE TN VELLORE 50000
In [11]:
# Change the name of 'Place' to 'City'

df4 = df4.rename(columns = {'Place' : 'City'})
df4
Out[11]:
City Students
0 AMR 3000
1 BHP 4000
2 CHN 10000
3 VELLORE 50000
In [12]:
# use left_on and right_on when you may wish to merge two datasets with different column names

print(pd.merge(df3, df4, left_on = "Place", right_on = 'City'))
     Place State     City  Students
0      AMR    AP      AMR      3000
1      BHP    MP      BHP      4000
2      CHN    TN      CHN     10000
3  VELLORE    TN  VELLORE     50000
In [13]:
# merge on an index can be used when we want to merge based on index of the dataframes

df1 = df3.set_index('Place')
df2 = df4.set_index('City')

print(df1)
print(df2)
        State
Place        
AMR        AP
BHP        MP
CHN        TN
VELLORE    TN
         Students
City             
AMR          3000
BHP          4000
CHN         10000
VELLORE     50000
In [14]:
pd.merge(df1, df2, left_index = True, right_index = True)
Out[14]:
State Students
Place
AMR AP 3000
BHP MP 4000
CHN TN 10000
VELLORE TN 50000
  • We can combine left_index with right_on or left_on with right_index
  • join() method performs a merge on indices
In [15]:
print(df1.join(df2))
        State  Students
Place                  
AMR        AP      3000
BHP        MP      4000
CHN        TN     10000
VELLORE    TN     50000
In [16]:
df8 = pd.DataFrame({
    'IIT': ['CHN', 'MUM','DEL', 'HYD'],
    'State': ['TN', 'MH', 'DEL', 'TEL']
})
df8
Out[16]:
IIT State
0 CHN TN
1 MUM MH
2 DEL DEL
3 HYD TEL

inner and outer joins¶

In [17]:
pd.merge(df3, df8, how = 'inner')
Out[17]:
Place State IIT
0 CHN TN CHN
1 VELLORE TN CHN
In [18]:
pd.merge(df3, df8, how = 'outer')
Out[18]:
Place State IIT
0 AMR AP NaN
1 BHP MP NaN
2 CHN TN CHN
3 VELLORE TN CHN
4 NaN MH MUM
5 NaN DEL DEL
6 NaN TEL HYD
In [19]:
pd.merge(df3, df8, how = 'inner', left_on = 'Place', right_on = 'IIT')
Out[19]:
Place State_x IIT State_y
0 CHN TN CHN TN