# SYildiz
# 2/6/2019 - 1:12 PM
#
# Amazon Web Scraping - Python

import numpy as np
import pandas as pd 
import requests # First I need to fetch the link
from bs4 import BeautifulSoup

######### Web Scraping Part ##############
##########################################
# Turkish-named aliases for the best-seller category slugs; each value is the
# path segment that gets inserted into the best-seller URL.
Elektronik = "electronics"       # electronics
Bebek = "baby-products"          # baby
Bilgisayar = "computers"         # computers
Ev = "home"                      # home
Kitap = "books"                  # books
Moda = "apparel"                 # fashion
Mutfak = "kitchen"               # kitchen
Ofis = "office-products"         # office
Oyuncak = "toys"                 # toys
Spor = "sporting-goods"          # sports
Oyun = "videogames"              # video games
Yapı = "home-improvement"        # home improvement

# Every category that is scraped, in the order the pages are fetched.
url_string = [
    Elektronik,
    Bebek,
    Bilgisayar,
    Ev,
    Kitap,
    Moda,
    Mutfak,
    Ofis,
    Oyuncak,
    Spor,
    Oyun,
    Yapı,
]

# Download the best-seller page of every category in url_string.
# data_link[k] holds the raw HTML text returned for url_string[k].
data_link = []
for category in url_string:
    url = 'https://www.amazon.com.tr/gp/bestsellers/' + category + '/ref=zg_bs_nav_0'
    # requests waits indefinitely by default; the timeout keeps one stalled
    # connection from hanging the whole script.
    response = requests.get(url, timeout=30)
    data_link.append(response.text)

# Parse every downloaded page with the lxml parser and keep its best-seller
# entries: data[k] is the find_all() result for the
# <li class="zg-item-immersion"> tags of page k.
data = [
    BeautifulSoup(html, 'lxml').find_all('li', class_='zg-item-immersion')
    for html in data_link
]
    
######### Products #############
################################

# Product name of every scraped item, read from the alt text of the image
# found under the item's <a> tag and that tag's <div>. Items without such an
# image get "" so this column keeps one row per item, like the other columns.
product = []
for items in data:
    for item in items:
        link = item.a
        box = link.div if link is not None else None
        image = box.find('img', alt=True) if box is not None else None
        # The tag itself must be checked: subscripting or navigating a missing
        # tag raises before any check on the attribute value could run.
        product.append(image['alt'] if image is not None else "")
dPN = pd.DataFrame(product, columns=['Product Name'])

######### PRICE #############
###############################

# Raw price text of every scraped item, in the same order as the items in
# `data`. An item with no <span class="p13n-sc-price"> gets the placeholder "0".
price = []
for items in data:
    for item in items:
        price_tag = item.find('span', class_='p13n-sc-price')
        price.append("0" if price_tag is None else price_tag.text)
        
# Numeric part of each raw price string: everything before the first ' TL'
# (the whole string when ' TL' does not occur).
price_int = [raw.partition(' TL')[0] for raw in price]

# Convert to float before building the frame: dots are dropped (thousands
# separators) and the comma becomes the decimal point, e.g. "1.234,56" -> 1234.56.
# Assigning element by element through dPP['Price'][i] is chained assignment,
# which pandas warns about and which does not update the frame at all under
# Copy-on-Write.
price_values = [
    float(text.replace('.', '').replace(',', '.'))
    for text in price_int
]
dPP = pd.DataFrame(price_values, columns=['Price'])

######### Segment #############
###############################

# Category slug of every scraped item: url_string[k] is repeated once per item
# found on page k, so the column has one row per item.
# The former `is None` fallback ran after the append, so it could never replace
# a value and would have added a second row for the same item.
segment = [
    category
    for category, items in zip(url_string, data)
    for _ in items
]
sEG = pd.DataFrame(segment, columns=['Segment'])

######### Country #############
###############################

# One "Turkey" label per scraped row; the row count is taken from `price`.
country_labels = np.repeat("Turkey", len(price), axis=0)
country = pd.DataFrame({'Country': country_labels})

######### Merged Data #########
###############################

# Combine the four single-column frames row by row into one table with the
# columns Product Name, Price, Segment and Country.
merged_rows = [
    [name, amount, category, land]
    for name, amount, category, land in zip(
        dPN['Product Name'], dPP['Price'], sEG['Segment'], country['Country']
    )
]
merged_data_turkey = pd.DataFrame(
    merged_rows,
    columns=['Product Name', 'Price', 'Segment', 'Country'],
)

# Category to inspect.
product_data = Bebek

# First ten rows of the chosen category, renumbered from 0, with the
# Segment column dropped.
in_segment = merged_data_turkey['Segment'] == product_data
query = (
    merged_data_turkey[in_segment]
    .head(10)
    .reset_index(drop=True)
    .drop(columns='Segment')
)
query