02-Imdb-Binary-Classification

H
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\Hussnain\\Anaconda3\\envs\\tensorflow\\lib\\site-packages\\h5py\\__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n",
      "  from ._conv import register_converters as _register_converters\n",
      "Using TensorFlow backend.\n"
     ]
    }
   ],
   "source": [
    "#Imports\n",
    "from keras.datasets import imdb\n",
    "\n",
    "from keras import models\n",
    "from keras import layers\n",
    "from keras import optimizers\n",
    "from keras import losses\n",
    "from keras import metrics,activations\n",
    "\n",
    "import matplotlib.pyplot as plt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Downloading data from https://s3.amazonaws.com/text-datasets/imdb.npz\n",
      " 1048576/17464789 [>.............................] - ETA: 53:49"
     ]
    }
   ],
   "source": [
    "#Downloading data from https://s3.amazonaws.com/text-datasets/imdb.npz\n",
    "\n",
    "(xtrain,ytrain), (xtest, ytest) = imdb.load_data(num_words=10000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Exploring the dataset\n",
    "\n",
    "print('xtrain shape', xtrain.shape)\n",
    "print('ytrain shape', ytrain.shape)\n",
    "print()\n",
    "print('xtest shape', xtest.shape)\n",
    "print('ytest shape', ytest.shape)\n",
    "print()\n",
    "print('xtrain first review as dictionary index', xtrain[1])\n",
    "print()\n",
    "print()\n",
    "print('ytrain label', ytrain[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#index to words mapping\n",
    "word_index = imdb.get_word_index()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "decode_review = ' '.join([reverse_word_index.get(i-3, reverse_word_index.get(i)) for i in xtrain[22]])\n",
    "decode_review"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "\n",
    "def vectorize_sequences(sequences, dimension=10000):\n",
    "    results = np.zeros((len(sequences), dimension))\n",
    "    for i, sequence in enumerate(sequences):\n",
    "        results[i, sequence] = 1. \n",
    "    return results\n",
    "\n",
    "x_train = vectorize_sequences(xtrain)\n",
    "x_test = vectorize_sequences(xtest)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ytrain = np.asarray(ytrain).astype('float32')\n",
    "ytest = np.asarray(ytest).astype('float32')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#model\n",
    "model = models.Sequential()\n",
    "model.add(layers.Dense(16, activation=activations.relu, input_shape=(10000,)))\n",
    "model.add(layers.Dense(16, activation=activations.relu))\n",
    "model.add(layers.Dense(1, activation=activations.sigmoid))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model.compile(optimizer=optimizers.RMSprop(lr=0.0001), loss=losses.mse, metrics=['acc'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "x_val = x_train[:10000]\n",
    "y_val = ytrain[:10000]\n",
    "\n",
    "x_train_partial = x_train[10000:]\n",
    "y_train_partial = ytrain[10000:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "history = model.fit(x_train_partial, y_train_partial, epochs=4, batch_size=512, validation_data=(x_val,y_val))\n",
    "history_dict = history.history\n",
    "history_dict.keys()\n",
    "print(history.history['acc'][-1])\n",
    "print(history.history['val_acc'][-1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(model.predict(x_train_partial[22:23]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "loss = history_dict['loss']\n",
    "val_loss = history_dict['val_loss']\n",
    "epochs = range(0, len(loss)+1)\n",
    "epochs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%matplotlib\n",
    "acc = history.history['acc']\n",
    "val_acc = history.history['val_acc']\n",
    "loss = history.history['loss']\n",
    "val_loss = history.history['val_loss']\n",
    "\n",
    "epochs = range(1, len(acc) + 1)\n",
    "\n",
    "# \"bo\" is for \"blue dot\"\n",
    "plt.plot(epochs, loss, 'ro', label='Training loss')\n",
    "# b is for \"solid blue line\"\n",
    "plt.plot(epochs, val_loss, 'b', label='Validation loss')\n",
    "plt.title('Training and validation loss')\n",
    "plt.xlabel('Epochs')\n",
    "plt.ylabel('Loss')\n",
    "plt.legend()\n",
    "\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.clf()      # clear figure# clear  \n",
    "acc_values = history_dict['acc']\n",
    "val_acc_values = history_dict['val_acc']\n",
    "\n",
    "plt.plot(epochs, acc, 'bo', label='Training acc')\n",
    "plt.plot(epochs, val_acc, 'b', label='Validation acc')\n",
    "plt.title('Training and validation accuracy')\n",
    "plt.xlabel('Epochs')\n",
    "plt.ylabel('Loss')\n",
    "plt.legend()\n",
    "\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
About this Algorithm
#Imports
from keras.datasets import imdb

from keras import models
from keras import layers
from keras import optimizers
from keras import losses
from keras import metrics,activations

import matplotlib.pyplot as plt
C:\Users\Hussnain\Anaconda3\envs\tensorflow\lib\site-packages\h5py\__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
#Downloading data from https://s3.amazonaws.com/text-datasets/imdb.npz

(xtrain,ytrain), (xtest, ytest) = imdb.load_data(num_words=10000)
Downloading data from https://s3.amazonaws.com/text-datasets/imdb.npz
 1048576/17464789 [>.............................] - ETA: 53:49
#Exploring the dataset

print('xtrain shape', xtrain.shape)
print('ytrain shape', ytrain.shape)
print()
print('xtest shape', xtest.shape)
print('ytest shape', ytest.shape)
print()
print('xtrain first review as dictionary index', xtrain[1])
print()
print()
print('ytrain label', ytrain[0])
#index to words mapping
word_index = imdb.get_word_index()
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])
decode_review = ' '.join([reverse_word_index.get(i-3, reverse_word_index.get(i)) for i in xtrain[22]])
decode_review
import numpy as np

def vectorize_sequences(sequences, dimension=10000):
    results = np.zeros((len(sequences), dimension))
    for i, sequence in enumerate(sequences):
        results[i, sequence] = 1. 
    return results

x_train = vectorize_sequences(xtrain)
x_test = vectorize_sequences(xtest)
ytrain = np.asarray(ytrain).astype('float32')
ytest = np.asarray(ytest).astype('float32')
#model
model = models.Sequential()
model.add(layers.Dense(16, activation=activations.relu, input_shape=(10000,)))
model.add(layers.Dense(16, activation=activations.relu))
model.add(layers.Dense(1, activation=activations.sigmoid))
model.compile(optimizer=optimizers.RMSprop(lr=0.0001), loss=losses.mse, metrics=['acc'])
x_val = x_train[:10000]
y_val = ytrain[:10000]

x_train_partial = x_train[10000:]
y_train_partial = ytrain[10000:]
history = model.fit(x_train_partial, y_train_partial, epochs=4, batch_size=512, validation_data=(x_val,y_val))
history_dict = history.history
history_dict.keys()
print(history.history['acc'][-1])
print(history.history['val_acc'][-1])
print(model.predict(x_train_partial[22:23]))
loss = history_dict['loss']
val_loss = history_dict['val_loss']
epochs = range(0, len(loss)+1)
epochs
%matplotlib
acc = history.history['acc']
val_acc = history.history['val_acc']
loss = history.history['loss']
val_loss = history.history['val_loss']

epochs = range(1, len(acc) + 1)

# "bo" is for "blue dot"
plt.plot(epochs, loss, 'ro', label='Training loss')
# b is for "solid blue line"
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.show()
plt.clf()      # clear figure# clear  
acc_values = history_dict['acc']
val_acc_values = history_dict['val_acc']

plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.show()