Netflix Scrapper

The Algorithms

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Netflix Scrapper\n",
    "\n",
    "The purpose of the code is to get details of all the Categories on Netflix and then to gather information about Sub-Categories and movies under each Sub-Category."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from bs4 import BeautifulSoup\n",
    "import requests\n",
    "import pandas as pd\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def make_soup(url):\n",
    "    return BeautifulSoup(requests.get(url).text, 'html.parser')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def browseCategory(category, data):\n",
    "    category_url = data[category-1][2]\n",
    "    category = data[category-1][1]\n",
    "    subCategory_details = []\n",
    "    count = 1\n",
    "    subCategories = []\n",
    "    soup = make_soup(category_url)\n",
    "    cards_list = soup.find_all('section',{'class':'nm-collections-row'})\n",
    "    for card in cards_list:\n",
    "        try:\n",
    "            subCategory = card.find('h1').text\n",
    "            movie_list = []\n",
    "            movies = card.find_all('li')\n",
    "            movie_count = 1\n",
    "            for movie in movies:\n",
    "                try:\n",
    "                    movie_title = movie.find('span',{'class':'nm-collections-title-name'}).text\n",
    "                    movie_link = movie.find('a').get('href')\n",
    "                    movie_list.append([movie_count, movie_title , movie_link])\n",
    "                    movie_count += 1\n",
    "                except AttributeError:\n",
    "                    pass\n",
    "            subCategories.append(subCategory)\n",
    "            subCategory_details.append(movie_list)\n",
    "            count += 1\n",
    "        except AttributeError:\n",
    "            pass\n",
    "    return subCategories, subCategory_details, count-1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def getCategories(base_url):\n",
    "    category_soup = make_soup(base_url)\n",
    "    categories = category_soup.find_all('section',{'class':'nm-collections-row'})\n",
    "    result=[]\n",
    "    count = 1\n",
    "    for category in categories:\n",
    "        try:\n",
    "            Title = category.find('span', {'class':'nm-collections-row-name'}).text\n",
    "            url = category.find('a').get('href')\n",
    "            result.append([count, Title, url])\n",
    "            count += 1\n",
    "        except AttributeError:\n",
    "            pass\n",
    "    #print(result)\n",
    "    return result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def main():\n",
    "    netflix_url = \"https://www.netflix.com/in/browse/genre/839338\"\n",
    "    categories = getCategories(netflix_url)\n",
    "    print(\"Please select one of the category\")\n",
    "    df = pd.DataFrame(np.array(categories), columns=['Sr.No', 'Title', 'link'])\n",
    "    print(df.to_string(index=False))\n",
    "    choice = int(input('\\n\\n Please Enter your Choice: \\n'))\n",
    "    subCategories, movieList, count = browseCategory(choice, categories)\n",
    "    for i in range(0, count):\n",
    "        print(subCategories[i],'\\n\\n')\n",
    "        subCategory_df = pd.DataFrame(np.array(movieList[i]), columns=['Sr.No', 'Title', 'link'])\n",
    "        print(subCategory_df.to_string(index=False))\n",
    "        print(\"\\n\\n\\n\")\n",
    "    \n",
    "if __name__ == '__main__':\n",
    "    main()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}

Ver en GitHub

Prueba este código

Acerca de este algoritmo

Netflix Scrapper

The purpose of this code is to scrape the list of categories from a Netflix browse page, let the user pick one of them, and then list the sub-categories of that category together with the movies found under each sub-category.

from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np

def make_soup(url, timeout=10):
    """Download ``url`` and parse the response body as HTML.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. The
            request used to be sent without a timeout, which lets a
            stalled connection block the script indefinitely.

    Returns:
        A ``BeautifulSoup`` tree built with the ``'html.parser'`` backend.

    Raises:
        requests.exceptions.Timeout: If the server does not answer within
            ``timeout`` seconds.
    """
    response = requests.get(url, timeout=timeout)
    return BeautifulSoup(response.text, 'html.parser')

def browseCategory(category, data):
    """Scrape the sub-categories, and the titles under each, for one category.

    Args:
        category: 1-based position of the chosen row in ``data``.
        data: Rows of ``[number, title, url]`` as built by ``getCategories``.

    Returns:
        A tuple ``(subCategories, subCategory_details, count)`` where
        ``subCategories`` is the list of sub-category headings,
        ``subCategory_details[i]`` is the list of
        ``[number, title, link]`` rows found under ``subCategories[i]``
        and ``count`` is the number of sub-categories.

    Raises:
        IndexError: If ``category`` is not between 1 and ``len(data)``.
    """
    # Reject 0 and negatives explicitly: ``data[category - 1]`` would
    # otherwise wrap around and silently pick a row from the end.
    if not 1 <= category <= len(data):
        raise IndexError(
            'category must be between 1 and {}, got {}'.format(len(data), category)
        )
    category_url = data[category - 1][2]

    soup = make_soup(category_url)
    subCategories = []
    subCategory_details = []
    for card in soup.find_all('section', {'class': 'nm-collections-row'}):
        heading = card.find('h1')
        if heading is None:
            # No <h1> means there is no sub-category name to report.
            continue

        movie_list = []
        for movie in card.find_all('li'):
            title_tag = movie.find('span', {'class': 'nm-collections-title-name'})
            link_tag = movie.find('a')
            if title_tag is None or link_tag is None:
                # Skip list items that lack a title or a link.
                continue
            # Numbering counts only the items that were actually kept.
            movie_list.append(
                [len(movie_list) + 1, title_tag.text, link_tag.get('href')]
            )

        subCategories.append(heading.text)
        subCategory_details.append(movie_list)

    return subCategories, subCategory_details, len(subCategories)

def getCategories(base_url):
    """Collect the categories listed on the page at ``base_url``.

    Every ``<section class="nm-collections-row">`` that has both a
    ``<span class="nm-collections-row-name">`` and an ``<a>`` contributes
    one row; sections missing either element are skipped.

    Returns:
        A list of ``[number, title, url]`` rows, numbered from 1 in page
        order.
    """
    page = make_soup(base_url)
    rows = []
    for section in page.find_all('section', {'class': 'nm-collections-row'}):
        try:
            name = section.find('span', {'class': 'nm-collections-row-name'}).text
            link = section.find('a').get('href')
        except AttributeError:
            # ``find`` returned None for the span or the anchor.
            continue
        rows.append([len(rows) + 1, name, link])
    return rows

def main():
    """Show the Netflix categories, ask for one, and print its contents.

    The categories scraped from the hard-coded browse page are printed as
    a table. The user then enters the serial number of one category, and
    every sub-category of it is printed followed by a table of its titles.
    Nothing is returned; all results go to standard output.
    """
    columns = ['Sr.No', 'Title', 'link']
    netflix_url = "https://www.netflix.com/in/browse/genre/839338"
    categories = getCategories(netflix_url)
    if not categories:
        # Nothing to choose from; stop instead of prompting for a choice.
        print("No categories found.")
        return

    print("Please select one of the category")
    # Build the frame straight from the row lists: going through
    # np.array turned the serial numbers into strings and raised
    # ValueError for an empty list.
    df = pd.DataFrame(categories, columns=columns)
    print(df.to_string(index=False))

    try:
        choice = int(input('\n\n Please Enter your Choice: \n'))
    except ValueError:
        print("Invalid choice: please enter a number.")
        return
    if not 1 <= choice <= len(categories):
        print("Invalid choice: enter a number between 1 and {}.".format(len(categories)))
        return

    subCategories, movieList, _ = browseCategory(choice, categories)
    for name, movies in zip(subCategories, movieList):
        print(name, '\n\n')
        # A sub-category with no titles yields an empty frame, not a crash.
        subCategory_df = pd.DataFrame(movies, columns=columns)
        print(subCategory_df.to_string(index=False))
        print("\n\n\n")
    
# Start the interactive scraper only when this file is executed directly,
# not when it is imported as a module.
if __name__ == '__main__':
    main()

¿Qué es un algoritmo?Acerca de Lenguajes de programación Contribuir Donar

GitHub Gitter X Código fuente Contacto