# Parse pictures and info from Instagram
from selenium import webdriver
from bs4 import BeautifulSoup as bs
import time
import re
from urllib.request import urlopen
import json
from pandas.io.json import json_normalize
import pandas as pd, numpy as np
# --- Configuration ---
num_pages = 100     # extra scroll iterations to run on the hashtag page
hashtag = 'selfie'
# Other hashtags worth checking: selfiegram, selfiestick, selfiequeen,
# selfienation, selfiesunday, selfietime, selfies, selfie, selfieofday,
# selfiesfordays, makeup, beauty, me
# Alternative entry point kept from the original (scrape one user's profile
# instead of a hashtag); the dead triple-quoted string has been turned into
# comments so it is no longer an executable expression statement:
# username = 'vasily.onl'
# browser = webdriver.Chrome('C:/chromedriver.exe')
# browser.get('https://www.instagram.com/' + username + '/?hl=en')
# Pagelength = browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
# --- Open the hashtag page and harvest the first batch of post links ---
browser = webdriver.Chrome('C:/chromedriver.exe')
browser.get('https://www.instagram.com/explore/tags/' + hashtag)
# Two scrolls force Instagram to lazy-load the first grid of posts.
Pagelength = browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
Pagelength = browser.execute_script("window.scrollTo(0, document.body.scrollHeight/1.5);")
links = []
source = browser.page_source
data = bs(source, 'html.parser')
body = data.find('body')
script = body.find('span')
for link in script.findAll('a'):
    href = link.get('href')
    # BUG FIX: <a> tags without an href return None, which made
    # re.match() raise TypeError; guard first, then keep only post
    # permalinks (paths that start with "/p").
    if href and href.startswith('/p'):
        links.append('https://www.instagram.com' + href)
# Sleep is required: without it Instagram may interrupt the script and
# refuse to scroll through further pages.
time.sleep(5)
# --- Scroll through additional pages, collecting more post links each time ---
for i in range(num_pages - 1):
    # Scroll one window-section further on every iteration so that the
    # next batch of posts is lazy-loaded into the DOM.
    Pagelength = browser.execute_script(
        "window.scrollTo(document.body.scrollHeight/{}, document.body.scrollHeight/{});".format((i + 1) * 1.5, (i + 2) * 1.5))
    source = browser.page_source
    data = bs(source, 'html.parser')
    body = data.find('body')
    script = body.find('span')
    for link in script.findAll('a'):
        href = link.get('href')
        # BUG FIX: anchors without an href yield None and crashed
        # re.match(); guard and keep only "/p..." post permalinks.
        if href and href.startswith('/p'):
            links.append('https://www.instagram.com' + href)
    # Sleep so Instagram does not throttle/interrupt the scrolling.
    time.sleep(5)
# --- Download each post page and flatten its JSON metadata into a table ---
frames = []
for url in links:
    try:
        print(url)
        page = urlopen(url).read()
        data = bs(page, 'html.parser')
        body = data.find('body')
        script = body.find('script')
        # The first <script> tag holds "window._sharedData = {...};".
        # BUG FIX: the original stripped EVERY ';' from the payload, which
        # corrupts captions/JSON values containing semicolons; remove only
        # the assignment prefix and the single trailing semicolon.
        raw = script.text.strip().replace('window._sharedData =', '').strip().rstrip(';')
        json_data = json.loads(raw)
        posts = json_data['entry_data']['PostPage'][0]['graphql']
        # (The original json.dumps()/json.loads() round-trip was a no-op
        # and has been removed.)
        x = pd.DataFrame.from_dict(json_normalize(posts), orient='columns')
        x.columns = x.columns.str.replace("shortcode_media.", "")
        frames.append(x)
    except Exception as exc:
        # Best-effort scrape: report and skip posts that fail to download
        # or parse instead of silently swallowing every error (the
        # original used a bare "except:" whose body was a no-op).
        print('skipping {}: {}'.format(url, exc))
# BUG FIX: repeated DataFrame.append() is quadratic and was removed in
# pandas 2.x; concatenate all per-post frames once at the end. Guard the
# empty case so `result` always exists for the code below.
result = pd.concat(frames) if frames else pd.DataFrame()
# Posts can be collected more than once while scrolling: drop duplicate
# shortcodes, renumber the rows from zero, and persist the table as
# "<hashtag>.csv" next to the script.
deduped = result.drop_duplicates(subset='shortcode')
result = deduped.reset_index(drop=True)
result.to_csv('{}.csv'.format(hashtag))
import os
import requests
import urllib.request
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
# --- Re-load the scraped table and download every non-video image ---
hashtag = 'selfie'
result = pd.read_csv('{}.csv'.format(hashtag))
directory = "imgs"
# BUG FIX: the target directory was never created, so urlretrieve failed
# on a fresh checkout (the `os` import was present but unused).
os.makedirs(directory, exist_ok=True)
for i in range(len(result)):
    # BUG FIX: the original tested `result['is_video'] == 'FALSE'`, which
    # compares the WHOLE column to a string and raises "truth value of a
    # Series is ambiguous"; pandas parses the CSV column as booleans, so
    # test the value of row i instead.
    if not result['is_video'][i]:
        # Name each file <shortcode>_<comment count>_<like count>.jpg;
        # os.path.join replaces the Windows-only "\" separator.
        filename = os.path.join(directory, "{}_{}_{}.jpg".format(
            result['shortcode'][i],
            int(result['edge_media_preview_comment.count'][i]),
            int(result['edge_media_preview_like.count'][i])))
        urllib.request.urlretrieve(result['display_url'][i], filename)