opexxx
7/23/2018 - 11:44 AM

* dumb split on first : * dumb convert any unknown utf-8 to surrogates and replace * can re-run on db if db has unique constraint on user&pa

  • dumb split on first : * dumb convert any unknown utf-8 to surrogates and replace * can re-run on db if db has unique constraint on user&pass tuple
CREATE DATABASE pw
    WITH 
    OWNER = XXXXXXXXXXXXXX
    ENCODING = 'UTF8'
    LC_COLLATE = 'en_US.UTF-8'
    LC_CTYPE = 'en_US.UTF-8'
    TABLESPACE = pg_default
    CONNECTION LIMIT = -1;

CREATE TABLE public.passwords
(
    password_id bigint NOT NULL DEFAULT nextval('passwords_password_id_seq'::regclass),
    password_username text COLLATE pg_catalog."C.UTF-8",
    password_password text COLLATE pg_catalog."default",
    CONSTRAINT passwords_pkey PRIMARY KEY (password_id)
)
WITH (
    OIDS = FALSE
)
TABLESPACE pg_default;

ALTER TABLE public.passwords
    OWNER to XXXXXXXXXXXXXX;

-- Index: i_password

-- DROP INDEX public.i_password;

CREATE INDEX i_password
    ON public.passwords USING btree
    (password_password COLLATE pg_catalog."default" varchar_ops)
    TABLESPACE pg_default;

-- Index: i_username

-- DROP INDEX public.i_username;

CREATE INDEX i_username
    ON public.passwords USING btree
    (password_username COLLATE pg_catalog."C.UTF-8" bpchar_pattern_ops)
    TABLESPACE pg_default;

-- Index: u_up

-- DROP INDEX public.u_up;

CREATE UNIQUE INDEX u_up
    ON public.passwords USING btree
    (password_username COLLATE pg_catalog."C.UTF-8", password_password COLLATE pg_catalog."default")
    TABLESPACE pg_default;
#!/usr/bin/env python3

# by Gunni @ github.com 2017-12-13

import glob
import sys
import os.path
import pathlib
import psycopg2
from random import shuffle
from psycopg2 import sql

files = glob.glob('data/*/*', recursive=True)

connect_str = "dbname='pw' user='xxxxxxxxxx' host='xxxxxxxxxx' password='xxxxxxxxxx'"
conn = psycopg2.connect(connect_str)
cursor = conn.cursor()

def mark(c):
    sys.stdout.write(c)
    sys.stdout.flush()

shuffle(files)

for pwfile in files:
    print('{} > '.format(pwfile), end = '')

    if os.path.isdir(os.path.join('done', pwfile)):
        print('file is in done')
        continue

    n = 0
    b = 0
    with open(pwfile, 'r', encoding='utf-8', errors='surrogateescape') as f:
        try:
            for line in f:
                up = line.encode('utf-8', errors='surrogateescape').decode('utf8', 'replace').split(':')
                user = up[0]
                if len(up) != 2:
                    passw = ''
                else:
                    passw = ':'.join(up[1:])

                try:
                    cursor.execute('insert into passwords (password_username, password_password) values (%s, %s)', [user, passw])
                    n += 1
                    if n % 1000 == 0:
                        conn.commit()
                        mark('.')
                except psycopg2.IntegrityError as e:
                    conn.rollback()
                    b += 1
                    if b % 1000 == 0:
                        mark('!')
                except ValueError as e:
                    conn.rollback()
                    passw = '\\0'
                    mark('0')

        except UnicodeDecodeError as e:
            print(e)
            raise e

    print()
    pathlib.Path(os.path.join('done', pwfile)).mkdir(parents=True, exist_ok=True)

print('all done...')