import numpy as np
import pandas as pd


# None is not a number, so NumPy has to store every element as a generic
# Python object (note dtype=object in the output).
a = np.array(
    [1, 2, 3, None, 5],
)
a

array([1, 2, 3, None, 5], dtype=object)


# Any mathematical operation on None results in error.
# This cell raises TypeError on purpose: summing the object array ends up
# adding an int to None, and `+` is not defined between those two types.
a.sum()

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Input In [3], in <cell line: 2>()
      1 # Any mathematical operation on None results in error
----> 2 a.sum()

File /Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/numpy/core/_methods.py:48, in _sum(a, axis, dtype, out, keepdims, initial, where)
     46 def _sum(a, axis=None, dtype=None, out=None, keepdims=False,
     47          initial=_NoValue, where=True):
---> 48     return umr_sum(a, axis, dtype, out, keepdims, initial, where)

TypeError: unsupported operand type(s) for +: 'int' and 'NoneType'


# Let's add a number to None.
# This cell raises TypeError on purpose: NoneType does not support `+`.
None + 3

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Input In [4], in <cell line: 2>()
      1 # Let's add a number to None
----> 2 None + 3

TypeError: unsupported operand type(s) for +: 'NoneType' and 'int'


# NaN is itself a float, so NumPy can keep a native floating-point array
# instead of falling back to dtype=object.
# `np.nan` is the canonical spelling; the `np.NaN` alias was removed in
# NumPy 2.0 and raises AttributeError there.
a = np.array([1, 2, 3, np.nan, 5])
a

array([ 1.,  2.,  3., nan,  5.])


# Any mathematical operation on NaN results in nan (it does not raise,
# unlike None) -- the plain sum below prints nan
print(a.sum())

# NumPy does provide some special aggregations that will ignore these missing values
print(np.nansum(a), np.nanmin(a))

nan
11.0 1.0


# A Series mixing an int, NaN, a string and None.
# isnull() reports True for both NaN and None.
data_with_missing_val = pd.Series(data=[1, np.nan, 'hello', None])
print(data_with_missing_val.isnull())

0    False
1     True
2    False
3     True
dtype: bool


# Keep only the entries that are not missing: notnull() builds a boolean
# mask and .loc selects the rows where the mask is True.
data_with_missing_val.loc[data_with_missing_val.notnull()]

0        1
2    hello
dtype: object


# Column name -> list of values. Missing entries are written both as
# np.nan and as None ('Fifth'); isnull() below flags both.
dict_ = {
    'First': [90, 90, np.nan, 95],
    'Second': [33, 45, 56, np.nan],
    'Third': [np.nan, 45, 80, 98],
    'Fourth': [50, 60, 60, 70],
    'Fifth': [33, 45, 56, None],
}

# Each dictionary key becomes one column of the DataFrame
df = pd.DataFrame(data=dict_)

# Boolean frame of the same shape: True wherever a value is missing
df.isnull()


# Boolean mask for a single column: True where 'First' has a value
df['First'].notna()

0     True
1     True
2    False
3     True
Name: First, dtype: bool


# Replace every missing value with 0. inplace=True modifies df itself,
# so the NaNs are gone from df for any cell run after this one.
df.fillna(0, inplace = True)
df


# Fill the missing values with previous one
df = pd.DataFrame(dict_)
print(df)


# Forward-fill: each NaN takes the last valid value above it in the same
# column. A NaN in the first row has nothing above it and therefore stays NaN.
# ffill() replaces fillna(method='pad'), which is deprecated since pandas 2.1.
df.ffill()         # df.bfill() can be used to fill with next values

   First  Second  Third  Fourth  Fifth
0   90.0    33.0    NaN      50   33.0
1   90.0    45.0   45.0      60   45.0
2    NaN    56.0   80.0      60   56.0
3   95.0     NaN   98.0      70    NaN


# Fill using replace
df = pd.DataFrame(dict_)
df


# Swap every NaN for the sentinel -99, modifying df in place
df.replace(np.nan, -99, inplace=True)
df


# Drop all the rows which have NaN
df = pd.DataFrame(dict_)
df


# how='any' (the default) drops a row as soon as one of its values is NaN;
# how='all' would drop a row only if all the values in it are NaN
df.dropna(axis=0, how='any', inplace=True)
print('After dropping')
df

After dropping


# Fill with mean of a column for the entire DF
df = pd.DataFrame(dict_)
df


# df.mean() yields one mean per column; fillna matches those means to the
# columns by name, so each NaN gets the mean of its own column
column_means = df.mean()
df.fillna(value=column_means, inplace=True)
df


# Start again from the original data, NaNs included
df = pd.DataFrame(dict_)
df


# Mean of the 'First' column only; the NaN entry is left out of the
# average (90, 90 and 95 give 91.67)
means = df['First'].mean()
means

91.66666666666667


# fillna without inplace returns a new Series and leaves df untouched;
# `means` is the 'First' column mean computed in the previous cell
filled      = df['First'].fillna(means) 
df['First'] = filled                    # Assign to 'First' column so that DF is changed

df


# Another way to fill with mean
df = pd.DataFrame(dict_)
df


# Only the means of columns 'Third' through 'Fifth' are passed (a label
# slice includes both ends), so NaNs in 'First' and 'Second' are left as-is.
# Without inplace=True the filled frame is returned and df is not modified.
df.fillna(value=df.mean().loc['Third':'Fifth'])


from sklearn.impute import KNNImputer

# Small feature matrix (rows = samples, columns = features) with two
# missing entries
nan = np.nan
X = [
    [1, 2, nan],
    [3, 4, 3],
    [nan, 6, 5],
    [8, 8, 7],
]

# Each missing entry is replaced by the mean of that feature over the
# 2 nearest rows
imputer = KNNImputer(n_neighbors=2)
imputer.fit(X).transform(X)

array([[1. , 2. , 4. ],
       [3. , 4. , 3. ],
       [5.5, 6. , 5. ],
       [8. , 8. , 7. ]])

	First	Second	Third	Fourth	Fifth
0	90.000000	33.000000	74.333333	50	33.000000
1	90.000000	45.000000	45.000000	60	45.000000
2	91.666667	56.000000	80.000000	60	56.000000
3	95.000000	44.666667	98.000000	70	44.666667

Nulls	NaNs
isnull()
notnull()	notna()
	dropna()
	fillna()
replace()	replace()

Programming for Data Science

Pandas - Handling Missing data

Numpy and `NaN`

Identifying `Null`

Working with `NaN`

Imputation using KNN

Summary

	First	Second	Third	Fourth	Fifth
0	False	False	True	False	False
1	False	False	False	False	False
2	True	False	False	False	False
3	False	True	False	False	True

	First	Second	Third	Fourth	Fifth
0	90.0	33.0	0.0	50	33.0
1	90.0	45.0	45.0	60	45.0
2	0.0	56.0	80.0	60	56.0
3	95.0	0.0	98.0	70	0.0

	First	Second	Third	Fourth	Fifth
0	90.0	33.0	-99.0	50	33.0
1	90.0	45.0	45.0	60	45.0
2	-99.0	56.0	80.0	60	56.0
3	95.0	-99.0	98.0	70	-99.0

Programming for Data Science

Pandas - Handling Missing data

Numpy and NaN

Identifying Null

Working with NaN

Imputation using KNN

Summary

Numpy and `NaN`

Identifying `Null`

Working with `NaN`