Wednesday, 4 May 2022

SVM demo 1

#Data Pre-processing Step

# importing libraries 

import numpy as nm 

import matplotlib.pyplot as mtp 

import pandas as pd 

 

#importing datasets 

data_set= pd.read_csv('User_Data.csv') 

 

#Extracting independent and dependent variables

x= data_set.iloc[:, [2,3]].values 

y= data_set.iloc[:, 4].values 

 

# Splitting the dataset into training and test set. 

from sklearn.model_selection import train_test_split 

x_train, x_test, y_train, y_test= train_test_split(x, y, test_size= 0.25, random_state=0) 

 

#Feature Scaling

from sklearn.preprocessing import StandardScaler   

st_x= StandardScaler()   

x_train= st_x.fit_transform(x_train)   

x_test= st_x.transform(x_test)      
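
As a quick sanity check (not part of the original demo), the scaled training features should now have roughly zero mean and unit variance:

# Optional check: each scaled column should have mean ~0 and std ~1
print(x_train.mean(axis=0))   # approximately [0, 0]
print(x_train.std(axis=0))    # approximately [1, 1]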

 

from sklearn.svm import SVC # "Support vector classifier" 

classifier = SVC(kernel='linear', random_state=0) 

classifier.fit(x_train, y_train) 

 

#Predicting the test set result 

y_pred= classifier.predict(x_test)

 

#Creating the Confusion matrix 

from sklearn.metrics import confusion_matrix 

cm= confusion_matrix(y_test, y_pred) 

 

cm
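
Reading the raw matrix can be error-prone, so it helps to also derive the accuracy from it. A minimal sketch (not part of the original demo), assuming sklearn's convention that rows are actual classes and columns are predicted classes:

# Accuracy = correct predictions (the diagonal) / all predictions
print(cm.trace() / cm.sum())

# Equivalently, use sklearn's built-in helper
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))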

 

from matplotlib.colors import ListedColormap 

x_set, y_set = x_train, y_train 

x1, x2 = nm.meshgrid(nm.arange(start = x_set[:, 0].min() - 1, stop = x_set[:, 0].max() + 1, step = 0.01),
                     nm.arange(start = x_set[:, 1].min() - 1, stop = x_set[:, 1].max() + 1, step = 0.01))

mtp.contourf(x1, x2, classifier.predict(nm.array([x1.ravel(), x2.ravel()]).T).reshape(x1.shape),
             alpha = 0.75, cmap = ListedColormap(('red', 'green')))

mtp.xlim(x1.min(), x1.max()) 

mtp.ylim(x2.min(), x2.max()) 

for i, j in enumerate(nm.unique(y_set)): 

    mtp.scatter(x_set[y_set == j, 0], x_set[y_set == j, 1], 

        c = ListedColormap(('red', 'green'))(i), label = j) 

mtp.title('SVM classifier (Training set)') 

mtp.xlabel('Age') 

mtp.ylabel('Estimated Salary') 

mtp.legend() 

mtp.show() 

 

#Visualizing the test set result

from matplotlib.colors import ListedColormap 

x_set, y_set = x_test, y_test  

x1, x2 = nm.meshgrid(nm.arange(start = x_set[:, 0].min() - 1, stop = x_set[:, 0].max() + 1, step = 0.01),
                     nm.arange(start = x_set[:, 1].min() - 1, stop = x_set[:, 1].max() + 1, step = 0.01))

mtp.contourf(x1, x2, classifier.predict(nm.array([x1.ravel(), x2.ravel()]).T).reshape(x1.shape),
             alpha = 0.75, cmap = ListedColormap(('red', 'green')))

mtp.xlim(x1.min(), x1.max()) 

mtp.ylim(x2.min(), x2.max()) 

for i, j in enumerate(nm.unique(y_set)): 

    mtp.scatter(x_set[y_set == j, 0], x_set[y_set == j, 1], 

        c = ListedColormap(('red', 'green'))(i), label = j) 

mtp.title('SVM classifier (Test set)') 

mtp.xlabel('Age') 

mtp.ylabel('Estimated Salary') 

mtp.legend() 

mtp.show() 
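
The linear kernel above can only draw a straight decision boundary. As an optional variation (not in the original demo), the same pipeline can be rerun with a nonlinear RBF kernel by changing a single argument; the plotting code above then shows a curved boundary:

# Optional variation: the same classifier with an RBF kernel on the scaled data
rbf_classifier = SVC(kernel='rbf', random_state=0)
rbf_classifier.fit(x_train, y_train)
print(rbf_classifier.score(x_test, y_test))   # test-set accuracy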


Naïve Bayesian Classification Demo 1

 

%matplotlib inline

import numpy as np

import matplotlib.pyplot as plt

import seaborn as sns; sns.set()

 

from sklearn.datasets import make_blobs

X, y = make_blobs(100, 2, centers=2, random_state=2, cluster_std=1.5)

plt.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='RdBu');

 

from sklearn.naive_bayes import GaussianNB

model = GaussianNB()

model.fit(X, y);

 

rng = np.random.RandomState(0)

Xnew = [-6, -14] + [14, 18] * rng.rand(2000, 2)

ynew = model.predict(Xnew)

ynew

 

plt.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='RdBu')

lim = plt.axis()

plt.scatter(Xnew[:, 0], Xnew[:, 1], c=ynew, s=20, cmap='RdBu', alpha=0.1)

plt.axis(lim);

 

yprob = model.predict_proba(Xnew)

yprob[-8:].round(2)
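
Each row of yprob holds the posterior probabilities of the two classes for one point, so every row sums to 1, and the hard labels in ynew are simply the more probable class. A quick check (added here, not in the source):

# Posterior probabilities sum to 1 across classes
print(yprob.sum(axis=1)[:5])                 # [1. 1. 1. 1. 1.]

# Hard predictions are the argmax over those probabilities
print((yprob.argmax(axis=1) == ynew).all())  # True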

 

Naïve Bayes Classification Demo 2

 

import numpy as np

import matplotlib.pyplot as plt

import pandas as pd

import sklearn

 

dataset = pd.read_csv('Social_Network_Ads.csv')

X = dataset.iloc[:, [1, 2, 3]].values

y = dataset.iloc[:, -1].values

X

y

 

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

X[:,0] = le.fit_transform(X[:,0])

X

 

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

 

from sklearn.preprocessing import StandardScaler

sc = StandardScaler()

X_train = sc.fit_transform(X_train)

X_test = sc.transform(X_test)

X_train

 

from sklearn.naive_bayes import GaussianNB

classifier = GaussianNB()

classifier.fit(X_train, y_train)

 

y_pred  =  classifier.predict(X_test)

y_pred

 

y_test

 

# Making the Confusion Matrix

from sklearn.metrics import confusion_matrix, accuracy_score

ac = accuracy_score(y_test,y_pred)

cm = confusion_matrix(y_test, y_pred)

ac

 

cm
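
Accuracy compresses the confusion matrix into one number; for per-class detail, sklearn's classification report is a short optional addition:

# Optional: per-class precision, recall and F1-score on the test set
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))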

 

K-Means Clustering

import pandas as pd

import matplotlib.pyplot as plt

import seaborn as sns

sns.set()

from sklearn.cluster import KMeans

 

raw_data = pd.read_csv('Countries_exercise.csv')

 

Make a working copy of the data; the duplicate index column from the file is simply ignored, since only Longitude and Latitude are used below.

data = raw_data.copy()

 

plt.scatter(data['Longitude'], data['Latitude'])

plt.xlim(-180,180)

plt.ylim(-90, 90)

plt.show()

 

Select only the Longitude and Latitude columns to use as clustering features.

x = data.iloc[:,1:3]

 

Clustering

kmeans = KMeans(3)

kmeans.fit(x)

 

Clustering Results

identified_clusters = kmeans.fit_predict(x)

identified_clusters

 

data_with_clusters = data.copy()

data_with_clusters['Cluster'] = identified_clusters

data_with_clusters

 

plt.scatter(data['Longitude'], data['Latitude'],c=data_with_clusters['Cluster'], cmap = 'rainbow')

plt.xlim(-180,180)

plt.ylim(-90, 90)

plt.show()
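
The choice of 3 clusters above is arbitrary. A common heuristic for picking the number of clusters is the elbow method: fit K-Means for a range of k, record the within-cluster sum of squares (the fitted model's inertia_ attribute), and look for the "elbow" where the curve flattens. A minimal sketch, not part of the original exercise:

# Elbow method: within-cluster sum of squares (WCSS) for k = 1..9
wcss = []
for k in range(1, 10):
    wcss.append(KMeans(k).fit(x).inertia_)

plt.plot(range(1, 10), wcss)
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()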

 

 

Random Forest

Practical 5: Random forest model

 

#First, start with importing necessary Python packages −

import numpy as np

import matplotlib.pyplot as plt

import pandas as pd

 

#Next, download the iris dataset from its weblink as follows −

path = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"

 

#Next, we need to assign column names to the dataset as follows −

headernames = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'Class']

 

#Now, we need to read dataset to pandas dataframe as follows −

dataset = pd.read_csv(path, names = headernames)

dataset.head()

 

#Data preprocessing will be done with the help of the following script lines.

X = dataset.iloc[:, :-1].values

y = dataset.iloc[:, 4].values

X

y

#Next, we will divide the data into train and test splits. The following code will split the dataset into 70% training data and 30% testing data −

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30)

 

#Next, train the model with the help of the RandomForestClassifier class of sklearn as follows −

from sklearn.ensemble import RandomForestClassifier

classifier = RandomForestClassifier(n_estimators = 50)

classifier.fit(X_train, y_train)


#At last, we need to make predictions. It can be done with the help of the following script −

y_pred = classifier.predict(X_test)

 

#Next, print the results as follows −

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

result = confusion_matrix(y_test, y_pred)

print("Confusion Matrix:")

print(result)

result1 = classification_report(y_test, y_pred)

print("Classification Report:",)

print (result1)

result2 = accuracy_score(y_test,y_pred)

print("Accuracy:",result2)

 

 

Building a Logistic Regression


Create a logistic regression based on the bank data provided.

The data is based on the marketing campaign efforts of a Portuguese banking institution. The classification goal is to predict whether the client will subscribe to a term deposit (variable y).

Note that the first column of the dataset is the index.

Import the relevant libraries


import pandas as pd

import numpy as np

import statsmodels.api as sm

import matplotlib.pyplot as plt

import seaborn as sns

sns.set()

 

# this part may not be needed after the latest updates of the library

from scipy import stats

stats.chisqprob = lambda chisq, df: stats.chi2.sf(chisq, df)

 

 

 Load the ‘Example_bank_data.csv’ dataset.

from google.colab import files

uploaded = files.upload()

raw_data = pd.read_csv('Example_bank_data.csv')

raw_data

 

We want to know whether the bank marketing strategy was successful, so we need to transform the outcome variable into 0s and 1s in order to perform a logistic regression.

# We make sure to create a copy of the data before we start altering it. Note that we don't change the original data we loaded.

data = raw_data.copy()

 

# Removes the index column that came with the data

data = data.drop(['Unnamed: 0'], axis = 1)

 

# We use the map function to change any 'yes' values to 1 and 'no' values to 0. 

data['y'] = data['y'].map({'yes': 1, 'no': 0})

data

 

# Check the descriptive statistics

data.describe()

Declare the dependent and independent variables

 

y = data['y']

x1 = data['duration']

 

Simple Logistic Regression

x = sm.add_constant(x1)

reg_log = sm.Logit(y,x)

results_log = reg_log.fit()

 

# Get the regression summary

results_log.summary()
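
The coefficients in the summary act on the log-odds of subscribing, so exponentiating them gives odds ratios, which are often easier to read. A short optional addition:

# Optional: exp(coef) is the factor by which the odds of subscribing
# multiply for each extra unit of duration
np.exp(results_log.params)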

 

 

# Create a scatter plot of x1 (Duration, no constant) and y (Subscribed)

plt.scatter(x1,y,color = 'C0')

 

# Don't forget to label your axes!

plt.xlabel('Duration', fontsize = 20)

plt.ylabel('Subscription', fontsize = 20)

plt.show()

 

 

np.set_printoptions(formatter={'float': lambda x: "{0:0.2f}".format(x)})

#np.set_printoptions(formatter=None)

results_log.predict()

 

np.array(data['y'])

results_log.pred_table()

 

cm_df = pd.DataFrame(results_log.pred_table())

cm_df.columns = ['Predicted 0','Predicted 1']

cm_df = cm_df.rename(index={0: 'Actual 0', 1: 'Actual 1'})

cm_df

 

cm = np.array(cm_df)

accuracy_train = (cm[0,0]+cm[1,1])/cm.sum()

accuracy_train