lwzm
7/26/2019 - 5:56 AM

Google Play crawler

#!/usr/bin/env python3

from datetime import date, datetime
from pony.orm import *


# Module-level Pony ORM database object; the App entity is declared against
# it and it is bound to a SQLite file (or an in-memory database) further down.
db = Database()


# One row per crawled item. The crawler stores both plain app ids and full
# "https://..." listing URLs in the same table, distinguished by the id prefix.
class App(db.Entity):
    # Primary key: either an app id or a complete https:// URL.
    # 800 is the string length limit handed to Pony.
    id = PrimaryKey(str, 800)
    # Descriptive fields; nothing in the code visible here populates them.
    name = Optional(str, 100)
    author = Optional(str, 100)
    cate = Optional(str, 100)
    scored = Optional(str, 50)
    # Download count kept as text; filled in by the report script from the
    # saved page ('0' when the count pattern is not found).
    installed = Optional(str, 50)
    # Time the page was fetched; left unset until the crawler downloads it.
    dl = Optional(datetime)
    # Row timestamp, defaulted on the SQL side to the current local time.
    ts = Optional(datetime, sql_default="(datetime('now', 'localtime'))")


if __name__ != '__main__':
    # Imported by another script: register the connection hook first, then
    # bind to the on-disk SQLite file named 'db' and create missing tables.
    @db.on_connect(provider="sqlite")
    def _home_sqliterc(_, conn):
        """Run the statements in ~/.sqliterc on a new connection, if present."""
        from pathlib import Path

        rc_file = Path.home() / ".sqliterc"
        if rc_file.exists():
            conn.executescript(rc_file.read_text())

    db.bind('sqlite', filename='db')
    db.generate_mapping(create_tables=True)
else:
    # Executed directly: smoke-test the mapping against an in-memory database
    # by inserting one row and printing its SQL-defaulted timestamp.
    db.bind('sqlite', filename=':memory:')
    db.generate_mapping(create_tables=True)
    with db_session:
        App(id='a')
    with db_session:
        stamp = App['a'].ts
        print(stamp, type(stamp))
#!/usr/bin/env python3

from datetime import date, datetime
from collections import deque

from requests_html import HTMLSession
# Single HTML session reused for every page request made by fetch().
session = HTMLSession()

from entities import App, db_session, select

# Work queue of ids / URLs still to be fetched; main() seeds and drains it,
# fetch() appends newly discovered entries.
q = deque()


def init_app(id):
    """Make sure an App row exists for *id*.

    Args:
        id: App id or full ``https://`` URL used as the primary key.
            (The name shadows the builtin but is kept for compatibility
            with existing callers.)

    Returns:
        True if a new row was created, False if the row already existed.
    """
    with db_session:
        if App.get(id=id) is not None:
            return False
        # Instantiating the entity inside db_session schedules the INSERT;
        # no reference to the new object is needed afterwards.
        App(id=id)
        return True


def _app_id_from_link(link):
    """Map a Play Store link found on a page to the id used for queueing.

    Details links are reduced to the value of their ``id`` query parameter;
    any other link is returned unchanged and is later treated as a URL.

    Returns:
        The id string, or None when the extracted id could not be used
        safely as a file name under ``apps/``.
    """
    from urllib.parse import parse_qs, urlparse

    if not link.startswith("https://play.google.com/store/apps/details"):
        return link
    # Parse the query string instead of cutting at the last "id=" so that
    # trailing parameters ("&hl=en") and fragments are not glued onto the id.
    values = parse_qs(urlparse(link).query).get("id")
    if not values:
        # No usable id parameter: keep the link itself.
        return link
    app_id = values[0]
    # The id later becomes part of a file path, and an "https://" value would
    # be requested verbatim, so reject anything with a path separator.
    if app_id in (".", "..") or "/" in app_id or "\\" in app_id:
        return None
    return app_id


def fetch(id):
    """Download one app page or listing URL and queue the links found on it.

    Args:
        id: Either a full ``https://`` URL, fetched as-is, or an app id,
            fetched from the Play Store details page and saved to
            ``apps/<id>``. (The name shadows the builtin but is kept for
            compatibility with existing callers.)

    Returns:
        None. Does nothing beyond ensuring the row exists when the entry
        already has a download timestamp.
    """
    import os

    is_url = id.startswith("https://")
    if is_url:
        link = id
    else:
        link = f"https://play.google.com/store/apps/details?id={id}"

    init_app(id)

    with db_session:
        app = App[id]
        if app.dl:
            return
        r = session.get(link)
        if not is_url:
            # Create the output directory on demand so a fresh checkout
            # does not fail on the first write.
            os.makedirs("apps", exist_ok=True)
            with open(f'apps/{id}', 'wb') as f:
                f.write(r.html.raw_html)
        # Mark as downloaded only after the page has been stored.
        app.dl = datetime.now()

    for found in r.html.absolute_links:
        if not found.startswith("https://play.google.com/store/apps/"):
            continue
        found_id = _app_id_from_link(found)
        if found_id is None:
            continue
        # Only entries that were not known before are queued.
        if init_app(found_id):
            q.append(found_id)


def main():
    """Drain the work queue, fetching every entry that has no download time.

    The queue is seeded with all rows whose ``dl`` is unset; when there are
    none, the crawl starts from the top-apps listing. The queue length is
    printed before each step, including the final 0.
    """
    with db_session:
        q.extend(select(a.id for a in App if a.dl is None))

    if len(q) == 0:
        q.append("https://play.google.com/store/apps/top")

    while True:
        remaining = len(q)
        print(remaining)
        if remaining == 0:
            break
        target = q.popleft()
        # Cluster collection pages are skipped rather than fetched.
        if 'collection/cluster' not in target:
            fetch(target)


# Start the crawl only when this file is executed directly.
if __name__ == '__main__':
    main()
#!/usr/bin/env python3

import re
from entities import App, db_session, select

# Regex whose single group captures the digits in "Downloaded <digits> times";
# get_downloaded() searches saved pages with it.
pattern = r'Downloaded (\d+) times'


def get_downloaded(id):
    """Return the download count recorded for app *id*, as a string.

    A non-empty value already stored in ``App.installed`` is returned as-is.
    Otherwise the saved page ``apps/<id>`` is searched with ``pattern``; the
    captured digits (or ``'0'`` when there is no match) are stored on the
    row and returned.

    Args:
        id: App id, which is also the file name under ``apps/``. (The name
            shadows the builtin but is kept for compatibility with callers.)

    Returns:
        The count as a string of digits.

    Raises:
        OSError: If the saved page cannot be opened.
    """
    with db_session:
        app = App[id]
        cached = app.installed
        if cached:
            return cached
        # Pages are saved as raw bytes, so decode explicitly rather than rely
        # on the platform default. UTF-8 is assumed; errors="replace" keeps a
        # stray byte from aborting the run, and the ASCII pattern is unaffected.
        with open(f"apps/{id}", encoding="utf-8", errors="replace") as f:
            match = re.search(pattern, f.read())
        count = match.group(1) if match else '0'
        app.installed = count
        return count


def main():
    """Print every downloaded app with its download count, highest first.

    Only rows that have a download time and whose id is not an ``https://``
    URL are included. Output is one ``count<TAB>id`` line per app.
    """
    with db_session:
        app_ids = select(a.id for a in App if not a.id.startswith('https://') and a.dl)[:]

    ranked = sorted(
        ((int(get_downloaded(app_id)), app_id) for app_id in app_ids),
        reverse=True,
    )
    for count, app_id in ranked:
        print(count, app_id, sep='\t')


# Produce the report only when this file is executed directly.
if __name__ == '__main__':
    main()