Programming for Data Science¶

Pandas - Concat And Append¶

Dr. Bhargavi R

SCOPE, VIT Chennai

Introduction¶

  • In most of the Real world applications, we have to combine multiple datasets.
  • Sometimes it can be simple by appending or concatenation.
  • It can even be complicated database-style joins and merges that correctly handle any overlaps between the datasets.
  • In this session, we will see how to concatenate and append the dataset using pandas.

The concat() method¶

pd.concat() can be used for a simple concatenation of Series or DataFrame objects.

  • concat signature in pandas 0.23.4 v
pd.concat(objs, axis=0, join='outer', join_axes=None,
        ignore_index=False, keys=None, 
        levels=None, names=None,
        verify_integrity=False, sort=None, copy=True)
In [1]:
import numpy as np
import pandas as pd
In [2]:
df1 = pd.Series(['AMR', 'BHP', 'CHN', 'VEL', 'VEL'], name = 'Place' )
df1
Out[2]:
0    AMR
1    BHP
2    CHN
3    VEL
4    VEL
Name: Place, dtype: object
In [3]:
df2 = pd.Series(['AP', 'MP', 'TN', 'TN', 'TN'], name = 'state')
df2
Out[3]:
0    AP
1    MP
2    TN
3    TN
4    TN
Name: state, dtype: object
In [4]:
df_concat1 = pd.concat([df1, df2])    # Note that default join operation 
                                      # is 'outer' join
df_concat1
Out[4]:
0    AMR
1    BHP
2    CHN
3    VEL
4    VEL
0     AP
1     MP
2     TN
3     TN
4     TN
dtype: object
In [5]:
# Use verify_integrity to check for duplicates
df_concat1 = pd.concat([df1, df2], verify_integrity = True)  
df_concat1
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Input In [5], in <cell line: 2>()
      1 # Use verify_integrity to check for duplicates
----> 2 df_concat1 = pd.concat([df1, df2], verify_integrity = True)  
      3 df_concat1

File /Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/pandas/util/_decorators.py:311, in deprecate_nonkeyword_arguments.<locals>.decorate.<locals>.wrapper(*args, **kwargs)
    305 if len(args) > num_allow_args:
    306     warnings.warn(
    307         msg.format(arguments=arguments),
    308         FutureWarning,
    309         stacklevel=stacklevel,
    310     )
--> 311 return func(*args, **kwargs)

File /Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/pandas/core/reshape/concat.py:347, in concat(objs, axis, join, ignore_index, keys, levels, names, verify_integrity, sort, copy)
    143 @deprecate_nonkeyword_arguments(version=None, allowed_args=["objs"])
    144 def concat(
    145     objs: Iterable[NDFrame] | Mapping[Hashable, NDFrame],
   (...)
    154     copy: bool = True,
    155 ) -> DataFrame | Series:
    156     """
    157     Concatenate pandas objects along a particular axis with optional set logic
    158     along the other axes.
   (...)
    345     ValueError: Indexes have overlapping values: ['a']
    346     """
--> 347     op = _Concatenator(
    348         objs,
    349         axis=axis,
    350         ignore_index=ignore_index,
    351         join=join,
    352         keys=keys,
    353         levels=levels,
    354         names=names,
    355         verify_integrity=verify_integrity,
    356         copy=copy,
    357         sort=sort,
    358     )
    360     return op.get_result()

File /Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/pandas/core/reshape/concat.py:542, in _Concatenator.__init__(self, objs, axis, join, keys, levels, names, ignore_index, verify_integrity, copy, sort)
    539 self.verify_integrity = verify_integrity
    540 self.copy = copy
--> 542 self.new_axes = self._get_new_axes()

File /Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/pandas/core/reshape/concat.py:612, in _Concatenator._get_new_axes(self)
    610 def _get_new_axes(self) -> list[Index]:
    611     ndim = self._get_result_dim()
--> 612     return [
    613         self._get_concat_axis if i == self.bm_axis else self._get_comb_axis(i)
    614         for i in range(ndim)
    615     ]

File /Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/pandas/core/reshape/concat.py:613, in <listcomp>(.0)
    610 def _get_new_axes(self) -> list[Index]:
    611     ndim = self._get_result_dim()
    612     return [
--> 613         self._get_concat_axis if i == self.bm_axis else self._get_comb_axis(i)
    614         for i in range(ndim)
    615     ]

File /Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/pandas/_libs/properties.pyx:37, in pandas._libs.properties.CachedProperty.__get__()

File /Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/pandas/core/reshape/concat.py:674, in _Concatenator._get_concat_axis(self)
    669 else:
    670     concat_axis = _make_concat_multiindex(
    671         indexes, self.keys, self.levels, self.names
    672     )
--> 674 self._maybe_check_integrity(concat_axis)
    676 return concat_axis

File /Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/pandas/core/reshape/concat.py:682, in _Concatenator._maybe_check_integrity(self, concat_index)
    680 if not concat_index.is_unique:
    681     overlap = concat_index[concat_index.duplicated()].unique()
--> 682     raise ValueError(f"Indexes have overlapping values: {overlap}")

ValueError: Indexes have overlapping values: Int64Index([0, 1, 2, 3, 4], dtype='int64')
In [6]:
# Create a new index ignoring the original index
df_concat1 = pd.concat([df1, df2], ignore_index = True ) 
df_concat1
Out[6]:
0    AMR
1    BHP
2    CHN
3    VEL
4    VEL
5     AP
6     MP
7     TN
8     TN
9     TN
dtype: object
In [7]:
df_concat2 = pd.concat([df1,df2], axis = 1)
df_concat2
Out[7]:
Place state
0 AMR AP
1 BHP MP
2 CHN TN
3 VEL TN
4 VEL TN
In [8]:
df3 = pd.DataFrame({
    'Place': ['AMR', 'BHP', 'CHN', 'VEL'],
    'State' : ['AP', 'MP', 'TN','TN']
})
df3
Out[8]:
Place State
0 AMR AP
1 BHP MP
2 CHN TN
3 VEL TN
In [9]:
df4 = pd.DataFrame({
    'Place': ['AMR', 'BHP', 'CHN', 'VEL'],
    'Students' : [3000, 4000, 10000, 50000]
})
df4
Out[9]:
Place Students
0 AMR 3000
1 BHP 4000
2 CHN 10000
3 VEL 50000
In [10]:
df_concat3 = pd.concat([df3, df4])
df_concat3
Out[10]:
Place State Students
0 AMR AP NaN
1 BHP MP NaN
2 CHN TN NaN
3 VEL TN NaN
0 AMR NaN 3000.0
1 BHP NaN 4000.0
2 CHN NaN 10000.0
3 VEL NaN 50000.0
In [11]:
df_concat3 = pd.concat([df3, df4], join = 'inner')
df_concat3
Out[11]:
Place
0 AMR
1 BHP
2 CHN
3 VEL
0 AMR
1 BHP
2 CHN
3 VEL
In [12]:
df_concat4 = pd.concat([df3, df4], keys = ['df3', 'df4'],
                       names = ['Dataset', 'ind'])

print(df_concat4)
            Place State  Students
Dataset ind                      
df3     0     AMR    AP       NaN
        1     BHP    MP       NaN
        2     CHN    TN       NaN
        3     VEL    TN       NaN
df4     0     AMR   NaN    3000.0
        1     BHP   NaN    4000.0
        2     CHN   NaN   10000.0
        3     VEL   NaN   50000.0
In [13]:
df_concat4.loc[('df4',2)]
Out[13]:
Place           CHN
State           NaN
Students    10000.0
Name: (df4, 2), dtype: object
In [14]:
df_concat4.loc[('df4',2), 'Students']
Out[14]:
10000.0
In [15]:
df_concat4.loc['df4']     # Partial indexing
Out[15]:
Place State Students
ind
0 AMR NaN 3000.0
1 BHP NaN 4000.0
2 CHN NaN 10000.0
3 VEL NaN 50000.0
In [16]:
df_concat4.loc[('df3',2):('df4',2)]
Out[16]:
Place State Students
Dataset ind
df3 2 CHN TN NaN
3 VEL TN NaN
df4 0 AMR NaN 3000.0
1 BHP NaN 4000.0
2 CHN NaN 10000.0
In [17]:
df_concat4.loc[('df3',2):('df4',2), 'Place': 'Students']
Out[17]:
Place State Students
Dataset ind
df3 2 CHN TN NaN
3 VEL TN NaN
df4 0 AMR NaN 3000.0
1 BHP NaN 4000.0
2 CHN NaN 10000.0
In [18]:
# Slicing using IndexSlice
idx = pd.IndexSlice
df_concat4.loc[idx[('df3',2) : ('df4', 2)] ,  idx['Place': 'State']]
Out[18]:
Place State
Dataset ind
df3 2 CHN TN
3 VEL TN
df4 0 AMR NaN
1 BHP NaN
2 CHN NaN
In [19]:
# Multi level index - slicing with iloc
idx = pd.IndexSlice

df_concat4.iloc[idx[1:4, 1]]
Out[19]:
Dataset  ind
df3      1      MP
         2      TN
         3      TN
Name: State, dtype: object
In [20]:
df_concat4 = pd.concat([df3, df4], axis = 1)
df_concat4
Out[20]:
Place State Place Students
0 AMR AP AMR 3000
1 BHP MP BHP 4000
2 CHN TN CHN 10000
3 VEL TN VEL 50000
In [ ]: