NaN and NoneNone and NaN as essentially interchangable for indicating missing or null values.import numpy as np
import pandas as pd
a = np.array([1, 2, 3, None, 5])
a
array([1, 2, 3, None, 5], dtype=object)
# Any mathematical operation on None results in error
a.sum()
--------------------------------------------------------------------------- TypeError Traceback (most recent call last) Input In [3], in <cell line: 2>() 1 # Any mathematical operation on None results in error ----> 2 a.sum() File /Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/numpy/core/_methods.py:48, in _sum(a, axis, dtype, out, keepdims, initial, where) 46 def _sum(a, axis=None, dtype=None, out=None, keepdims=False, 47 initial=_NoValue, where=True): ---> 48 return umr_sum(a, axis, dtype, out, keepdims, initial, where) TypeError: unsupported operand type(s) for +: 'int' and 'NoneType'
# Let's add a number to None
None + 3
--------------------------------------------------------------------------- TypeError Traceback (most recent call last) Input In [4], in <cell line: 2>() 1 # Let's add a number to None ----> 2 None + 3 TypeError: unsupported operand type(s) for +: 'NoneType' and 'int'
NaN¶a = np.array([1, 2, 3, np.NaN, 5])
a
array([ 1., 2., 3., nan, 5.])
# Any mathematical operation on None results in nan
print(a.sum())
# NumPy does provide some special aggregations that will ignore these missing values
print(np.nansum(a), np.nanmin(a))
nan 11.0 1.0
Null¶data_with_missing_val = pd.Series([1, np.nan, 'hello', None])
print(data_with_missing_val.isnull())
0 False 1 True 2 False 3 True dtype: bool
# obtaining values which are not null
data_with_missing_val[data_with_missing_val.notnull()]
0 1 2 hello dtype: object
# dictionary of lists
dict_ = {
'First' : [90, 90, np.nan, 95],
'Second' : [33, 45, 56, np.nan],
'Third' : [np.nan, 45, 80, 98],
'Fourth' : [50, 60, 60, 70],
'Fifth' : [33, 45, 56, None]
}
# create a dataframe from dictionary of lists
df = pd.DataFrame(dict_)
# use isnull() function to check missing data
df.isnull()
| First | Second | Third | Fourth | Fifth | |
|---|---|---|---|---|---|
| 0 | False | False | True | False | False |
| 1 | False | False | False | False | False |
| 2 | True | False | False | False | False |
| 3 | False | True | False | False | True |
NaN¶df['First'].notna()
0 True 1 True 2 False 3 True Name: First, dtype: bool
df.fillna(0, inplace = True)
df
| First | Second | Third | Fourth | Fifth | |
|---|---|---|---|---|---|
| 0 | 90.0 | 33.0 | 0.0 | 50 | 33.0 |
| 1 | 90.0 | 45.0 | 45.0 | 60 | 45.0 |
| 2 | 0.0 | 56.0 | 80.0 | 60 | 56.0 |
| 3 | 95.0 | 0.0 | 98.0 | 70 | 0.0 |
# Fill the missing values with previous one
df = pd.DataFrame(dict_)
print(df)
df.fillna(method ='pad') # method -'bfill' can be used to fill with next values
First Second Third Fourth Fifth 0 90.0 33.0 NaN 50 33.0 1 90.0 45.0 45.0 60 45.0 2 NaN 56.0 80.0 60 56.0 3 95.0 NaN 98.0 70 NaN
| First | Second | Third | Fourth | Fifth | |
|---|---|---|---|---|---|
| 0 | 90.0 | 33.0 | NaN | 50 | 33.0 |
| 1 | 90.0 | 45.0 | 45.0 | 60 | 45.0 |
| 2 | 90.0 | 56.0 | 80.0 | 60 | 56.0 |
| 3 | 95.0 | 56.0 | 98.0 | 70 | 56.0 |
# Fill using replace
df = pd.DataFrame(dict_)
df
| First | Second | Third | Fourth | Fifth | |
|---|---|---|---|---|---|
| 0 | 90.0 | 33.0 | NaN | 50 | 33.0 |
| 1 | 90.0 | 45.0 | 45.0 | 60 | 45.0 |
| 2 | NaN | 56.0 | 80.0 | 60 | 56.0 |
| 3 | 95.0 | NaN | 98.0 | 70 | NaN |
df.replace(to_replace = np.nan, value = -99, inplace = True)
df
| First | Second | Third | Fourth | Fifth | |
|---|---|---|---|---|---|
| 0 | 90.0 | 33.0 | -99.0 | 50 | 33.0 |
| 1 | 90.0 | 45.0 | 45.0 | 60 | 45.0 |
| 2 | -99.0 | 56.0 | 80.0 | 60 | 56.0 |
| 3 | 95.0 | -99.0 | 98.0 | 70 | -99.0 |
# Drop all the rows which have NaN
df = pd.DataFrame(dict_)
df
| First | Second | Third | Fourth | Fifth | |
|---|---|---|---|---|---|
| 0 | 90.0 | 33.0 | NaN | 50 | 33.0 |
| 1 | 90.0 | 45.0 | 45.0 | 60 | 45.0 |
| 2 | NaN | 56.0 | 80.0 | 60 | 56.0 |
| 3 | 95.0 | NaN | 98.0 | 70 | NaN |
df.dropna(inplace = True) # how = 'all' option can be used to drop only
# if all the values in a row are NaN
print('After dropping')
df
After dropping
| First | Second | Third | Fourth | Fifth | |
|---|---|---|---|---|---|
| 1 | 90.0 | 45.0 | 45.0 | 60 | 45.0 |
# Fill with mean of a column for the entire DF
df = pd.DataFrame(dict_)
df
| First | Second | Third | Fourth | Fifth | |
|---|---|---|---|---|---|
| 0 | 90.0 | 33.0 | NaN | 50 | 33.0 |
| 1 | 90.0 | 45.0 | 45.0 | 60 | 45.0 |
| 2 | NaN | 56.0 | 80.0 | 60 | 56.0 |
| 3 | 95.0 | NaN | 98.0 | 70 | NaN |
df.fillna(df.mean(), inplace = True)
df
| First | Second | Third | Fourth | Fifth | |
|---|---|---|---|---|---|
| 0 | 90.000000 | 33.000000 | 74.333333 | 50 | 33.000000 |
| 1 | 90.000000 | 45.000000 | 45.000000 | 60 | 45.000000 |
| 2 | 91.666667 | 56.000000 | 80.000000 | 60 | 56.000000 |
| 3 | 95.000000 | 44.666667 | 98.000000 | 70 | 44.666667 |
df = pd.DataFrame(dict_)
df
| First | Second | Third | Fourth | Fifth | |
|---|---|---|---|---|---|
| 0 | 90.0 | 33.0 | NaN | 50 | 33.0 |
| 1 | 90.0 | 45.0 | 45.0 | 60 | 45.0 |
| 2 | NaN | 56.0 | 80.0 | 60 | 56.0 |
| 3 | 95.0 | NaN | 98.0 | 70 | NaN |
means = df['First'].mean()
means
91.66666666666667
filled = df['First'].fillna(means)
df['First'] = filled # Assign to 'First' column so that DF is changed
df
| First | Second | Third | Fourth | Fifth | |
|---|---|---|---|---|---|
| 0 | 90.000000 | 33.0 | NaN | 50 | 33.0 |
| 1 | 90.000000 | 45.0 | 45.0 | 60 | 45.0 |
| 2 | 91.666667 | 56.0 | 80.0 | 60 | 56.0 |
| 3 | 95.000000 | NaN | 98.0 | 70 | NaN |
# Another way to fill with mean
df = pd.DataFrame(dict_)
df
| First | Second | Third | Fourth | Fifth | |
|---|---|---|---|---|---|
| 0 | 90.0 | 33.0 | NaN | 50 | 33.0 |
| 1 | 90.0 | 45.0 | 45.0 | 60 | 45.0 |
| 2 | NaN | 56.0 | 80.0 | 60 | 56.0 |
| 3 | 95.0 | NaN | 98.0 | 70 | NaN |
df.fillna(df.mean()['Third':'Fifth'])
| First | Second | Third | Fourth | Fifth | |
|---|---|---|---|---|---|
| 0 | 90.0 | 33.0 | 74.333333 | 50 | 33.000000 |
| 1 | 90.0 | 45.0 | 45.000000 | 60 | 45.000000 |
| 2 | NaN | 56.0 | 80.000000 | 60 | 56.000000 |
| 3 | 95.0 | NaN | 98.000000 | 70 | 44.666667 |
from sklearn.impute import KNNImputer
nan = np.nan
X = [
[1, 2, nan],
[3, 4, 3],
[nan, 6, 5],
[8, 8, 7]
]
imputer = KNNImputer(n_neighbors=2)
imputer.fit_transform(X)
array([[1. , 2. , 4. ],
[3. , 4. , 3. ],
[5.5, 6. , 5. ],
[8. , 8. , 7. ]])
| Nulls | NaNs |
|---|---|
| isnull() | |
| notnull() | notna() |
| dropna() | |
| fillna() | |
| replace() | replace() |
Above are the few of the commonly used methods in Pandas for working with missing data. There are even more which you can explore in the Pandas documentation.