sklearn can be a good option for preprocessing data
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
MinMaxScaler rescales each feature to the range [0, 1] (by default): $$x_{scaled} = \frac{x - x_{min}}{x_{max} - x_{min}}$$
# Toy input: a 10x2 matrix of random integers drawn from [0, 100).
data = np.random.randint(low=0, high=100, size=(10, 2))
data
# Scaler created with its default settings.
scale_model = MinMaxScaler()
scale_model
# fit and transform are usually separate steps: we fit on the training data
# and then transform the test data. fit_transform does both on one array.
scale_model.fit_transform(data)
# Synthetic table: 50 rows, three integer features in [0, 100] and one
# integer class label in [0, 4] stored as a (50, 1) column.
features = np.random.randint(low=0, high=101, size=(50, 3))
labels = np.random.randint(low=0, high=5, size=(50, 1))
# Glue the label column onto the right-hand side of the feature matrix.
data = np.hstack((features, labels))
df = pd.DataFrame(data, columns=['f1', 'f2', 'f3', 'labels'])
# Preview the first ten rows.
df.head(10)
from sklearn.model_selection import train_test_split
# Hold out 33% of the rows as a test set; random_state is fixed so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.33,
    random_state=42,
)
# Shapes in the order: train features, train labels, test features, test labels.
tuple(part.shape for part in (X_train, y_train, X_test, y_test))
from sklearn.datasets import make_blobs
# Generate 20 two-feature sample points around 2 centers
# (centers = number of classes); random_state is fixed for repeatability.
data = make_blobs(
    n_samples=20,
    n_features=2,
    centers=2,
    random_state=75,
)
data
import matplotlib.pyplot as plt
# IPython magic (notebook only): render matplotlib figures inline.
%matplotlib inline
# Scatter the first column of data[0] against its second column, colouring
# each point by the matching entry of data[1].
plt.scatter(data[0][:,0],data[0][:,1],c=data[1])
# Load the diabetes CSV; the path is relative to the current working directory.
diabetes = pd.read_csv("files/pima-indians-diabetes.csv")
# Preview the first rows of the loaded table.
diabetes.head()
Deprecated:
Use LabelEncoder to convert string labels to integers, and then apply OneHotEncoder.
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
Convert the characters in the Group column to integers with LabelEncoder.
# Keep the column labels so the raw array can be wrapped in a DataFrame again.
cols = diabetes.columns
# Work on the underlying values array; columns are addressed by position below.
arr = diabetes.values
le = LabelEncoder()
# Learn the label -> integer mapping from column 9. In a train/test workflow,
# fit is done on the TRAINING data and the fitted encoder is reused to
# transform the test data; here the whole column is used for both.
le_fit = le.fit(arr[:,9])
# Overwrite column 9 in place with its integer codes.
arr[:,9] = le_fit.transform(arr[:,9])
# Preview the encoded array with the original column labels.
pd.DataFrame(arr,columns=cols).head()
Convert Age and Group to a one-hot encoding.
# One-hot encode columns 7 and 9; the list holds the column numbers on which
# the transformation should be applied.
# NOTE(review): `categorical_features` is the deprecated OneHotEncoder argument
# this section demonstrates; it was removed in scikit-learn 0.22 — confirm
# against the installed version before running.
oe = OneHotEncoder(categorical_features=[7,9])
# Fit is normally done on the training data only; here the encoder is fitted
# on the full array.
oe_fit = oe.fit(arr)
# .toarray() converts the transformed (presumably sparse) result to a dense array.
arr_new = oe_fit.transform(arr).toarray()
pd.DataFrame(arr_new).head()
PROBLEM: The categories considered for Age are only the ones encountered in the training data. If a new age value shows up at transform time, the code breaks.
# Inspect the categories the fitted encoder learned from the data.
oe_fit.categories_
Preferred method:
Use ColumnTransformer. This also eliminates the need for LabelEncoder.
from sklearn.compose import ColumnTransformer
# Start again from the DataFrame's column labels and raw values.
cols = diabetes.columns
arr = diabetes.values
# One OneHotEncoder is declared per column. Pass `categories` to fix the
# category set by hand instead of learning it from the data.
#   column 9 ==> Group
#   column 7 ==> Age (categories fixed to the integers 0..99)
colT = ColumnTransformer(
    [
        ("1", OneHotEncoder(), [9]),
        ("2", OneHotEncoder(categories=[np.arange(100)]), [7]),
    ]
)
colT_fit = colT.fit(arr)
Now you can see that all the specified categories have been encoded.
# Names of the generated one-hot columns. ColumnTransformer.get_feature_names()
# was deprecated in scikit-learn 1.0 and removed in 1.2, so prefer
# get_feature_names_out() when it exists and fall back on older releases.
np.array(
    colT_fit.get_feature_names_out()
    if hasattr(colT_fit, "get_feature_names_out")
    else colT_fit.get_feature_names()
)
# transform() returns just the transformed columns; .toarray() makes the
# result a dense array.
tr = colT_fit.transform(arr).toarray()
# Compare the encoded shape with the shape of the original array.
tr.shape, arr.shape
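ColumnTransformer returns only the transformed columns by default, so we have to drop the old columns from the original array and concatenate what remains with the transformed columns. (Alternatively, passing remainder='passthrough' to ColumnTransformer keeps the untouched columns in its output.)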
# Keep every original column except 7 and 9 (the ones that were one-hot
# encoded) and append the encoded columns on the right.
arr_new = np.hstack((arr[:, [0, 1, 2, 3, 4, 5, 6, 8]], tr))
arr_new.shape
# Preview the combined array.
pd.DataFrame(arr_new).head()