# Ajasra
# 2019-08-24 10:44 AM
#
# Parse Instagram: scrape pictures and post metadata from a hashtag feed.

from selenium import webdriver
from bs4 import BeautifulSoup as bs
import time
import re
from urllib.request import urlopen
import json
from pandas.io.json import json_normalize
import pandas as pd, numpy as np


# How many scroll iterations to walk through the hashtag feed.
num_pages = 100

# Hashtag whose feed is scraped.  Other candidates worth trying:
# selfiegram, selfiestick, selfiequeen, selfienation, selfiesunday,
# selfietime, selfies, selfieofday, selfiesfordays, makeup, beauty, me.
hashtag = 'selfie'

'''
username='vasily.onl'
browser = webdriver.Chrome('C:/chromedriver.exe')
browser.get('https://www.instagram.com/'+username+'/?hl=en')
Pagelength = browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
'''


# Open the hashtag feed and trigger the first rounds of lazy loading by
# scrolling: Instagram only renders posts as the page is scrolled.
browser = webdriver.Chrome('C:/chromedriver.exe')
browser.get('https://www.instagram.com/explore/tags/' + hashtag)
browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
browser.execute_script("window.scrollTo(0, document.body.scrollHeight/1.5);")

# Collected post URLs (https://www.instagram.com/p/<shortcode>/...).
links = []

# Parse the currently rendered HTML and collect every anchor that points
# at an individual post (a "/p/..." path).
data = bs(browser.page_source, 'html.parser')
body = data.find('body')
script = body.find('span')
for link in script.findAll('a'):
    href = link.get('href')
    # Anchors without an href are skipped: the original code passed
    # None straight into re.match, which raises a TypeError.
    if href and href.startswith('/p'):
        links.append('https://www.instagram.com' + href)

# Sleep time is required. Without it Instagram may interrupt the script
# and the page stops scrolling through new posts.
time.sleep(5)

# Keep scrolling through the remaining pages, harvesting post links from
# each freshly rendered slice of the feed.
for i in range(num_pages - 1):
    browser.execute_script(
        "window.scrollTo(document.body.scrollHeight/{}, document.body.scrollHeight/{});".format(
            (i + 1) * 1.5, (i + 2) * 1.5))
    data = bs(browser.page_source, 'html.parser')
    body = data.find('body')
    script = body.find('span')
    for link in script.findAll('a'):
        href = link.get('href')
        # Guard against anchors without an href: re.match on None raised
        # a TypeError in the original code.
        if href and href.startswith('/p'):
            links.append('https://www.instagram.com' + href)

    # Throttle between scrolls so Instagram does not cut the session off.
    time.sleep(5)


# Fetch every collected post page and flatten the embedded JSON metadata
# (window._sharedData) into one DataFrame row per post.
frames = []
for url in links:
    try:
        print(url)
        page = urlopen(url).read()
        data = bs(page, 'html.parser')
        script = data.find('body').find('script')
        # The first <script> holds "window._sharedData = {...};": strip the
        # assignment prefix and only the *trailing* semicolon — a blanket
        # .replace(';', '') would also corrupt semicolons inside the data.
        raw = script.text.strip().replace('window._sharedData =', '').rstrip(';')
        json_data = json.loads(raw)
        posts = json_data['entry_data']['PostPage'][0]['graphql']
        x = pd.DataFrame.from_dict(json_normalize(posts), orient='columns')
        x.columns = x.columns.str.replace("shortcode_media.", "")
        frames.append(x)
    except (OSError, KeyError, IndexError, ValueError, AttributeError) as err:
        # Skip posts that fail to download or parse, but report why instead
        # of the original bare `except:` that silently swallowed everything
        # (including KeyboardInterrupt) and whose `np.nan` body was a no-op.
        print('skipping {}: {}'.format(url, err))

# One concat at the end instead of the quadratic DataFrame.append-in-a-loop
# (DataFrame.append was removed in pandas 2.0).
result = pd.concat(frames) if frames else pd.DataFrame()


# Scrolling windows overlap, so the same post can be harvested twice:
# keep a single row per shortcode and renumber the index from zero.
result = result.drop_duplicates(subset='shortcode')
result = result.reset_index(drop=True)

# Persist the scraped metadata; the download stage below reads it back.
result.to_csv('{}.csv'.format(hashtag))
import os
import requests
import urllib.request
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

hashtag = 'selfie'

# Reload the metadata written by the scraping stage above.
result = pd.read_csv('{}.csv'.format(hashtag))

# Download every non-video post image, naming each file
# <shortcode>_<comment count>_<like count>.jpg so engagement counts are
# visible straight from the filename.
directory = "imgs"
os.makedirs(directory, exist_ok=True)  # original crashed if imgs/ was missing
for i in range(len(result)):
    # Bug fix: the original tested `result['is_video'] == 'FALSE'`, which
    # compares the entire column (a Series) inside `if` and raises
    # "truth value of a Series is ambiguous" on every run.  read_csv
    # restores the column as booleans, so test the single row's value.
    if not result['is_video'][i]:
        # os.path.join instead of "{}\{}" — the original relied on the
        # invalid "\{" escape and a Windows-only separator.
        filename = os.path.join(
            directory,
            "{}_{}_{}.jpg".format(
                result['shortcode'][i],
                int(result['edge_media_preview_comment.count'][i]),
                int(result['edge_media_preview_like.count'][i])))
        urllib.request.urlretrieve(result['display_url'][i], filename)