// ishideo
// 10/14/2016 - 1:24 AM
//
// meta2tsv.go

package main

import (
	"bufio"
	"bytes"
	"fmt"
	"golang.org/x/net/html"
	"gopkg.in/xmlpath.v2"
	"io"
	"log"
	"net/http"
	"os"
	"runtime"
	"strings"
	"sync"
	"time"
)

// Result holds the metadata extracted from a single HTML page.
type Result struct {
	// Title is the text of the <title> element, Keywords and Description are
	// the content attributes of the matching <meta name="..."> elements.
	// A field is left empty when the page does not provide the value.
	Title, Keywords, Description string
}

func GetPage(url string) {
	var d Result
	request, err := http.NewRequest("Get", url, nil)
	if err != nil {
		log.Fatal(err)
	}
	request.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36")
	//request.Header.Set("User-Agent", "Mozilla/5.0 (iPhone; CPU iPhone OS 5_0 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 Mobile/9A334 Safari/7534.48.3")
	client := new(http.Client)
	response, err := client.Do(request)
	if err != nil {
		log.Fatal(err)
	}
	defer response.Body.Close()
	brokenHtml := io.Reader(response.Body)
	root, err := html.Parse(brokenHtml)

	if err != nil {
		log.Fatal(err)
	}

	var b bytes.Buffer
	html.Render(&b, root)
	fixedHtml := b.String()

	reader := strings.NewReader(fixedHtml)
	xmlroot, xmlerr := xmlpath.ParseHTML(reader)

	if xmlerr != nil {
		log.Fatal(xmlerr)
	}

	xpathTitle := `//title/text()`
	xpathKeywords := `//meta[@name="keywords"]/@content`
	xpathDescription := `//meta[@name="description"]/@content`
	pathTitle := xmlpath.MustCompile(xpathTitle)
	if value, ok := pathTitle.String(xmlroot); ok {
		d.Title = value
	}
	pathKeywords := xmlpath.MustCompile(xpathKeywords)
	if value, ok := pathKeywords.String(xmlroot); ok {
		d.Keywords = value
	}
	pathDescription := xmlpath.MustCompile(xpathDescription)
	if value, ok := pathDescription.String(xmlroot); ok {
		d.Description = value
	}
	log.Printf("%s\t%s\t%s\t%s\n", url, d.Title, d.Keywords, d.Description)
	fmt.Printf("%s\t%s\t%s\t%s\n", url, d.Title, d.Keywords, d.Description)
}

// File2Array reads the file at filePath and returns its lines in order,
// without line terminators.
//
// If the file cannot be opened, a message is written to standard error and
// the process exits with status 1. If scanning stops early (for example
// because a line exceeds bufio.Scanner's token limit), the cause is written
// to standard error and the lines read so far are returned.
func File2Array(filePath string) []string {
	f, err := os.Open(filePath)
	if err != nil {
		fmt.Fprintf(os.Stderr, "File %s could not read: %v\n", filePath, err)
		os.Exit(1)
	}
	defer f.Close()

	lines := make([]string, 0, 450)
	scanner := bufio.NewScanner(f)
	for scanner.Scan() {
		lines = append(lines, scanner.Text())
	}
	// Report the scanner's own error; the os.Open error is always nil here.
	if err := scanner.Err(); err != nil {
		fmt.Fprintf(os.Stderr, "File %s scan error: %v\n", filePath, err)
	}
	return lines
}

// execLoop calls GetPage once for every entry in lines, running at most
// runtime.NumCPU() calls concurrently, and returns after all of them finish.
//
// A worker slot is acquired before each goroutine is started, so the number
// of live goroutines stays bounded by the concurrency limit instead of
// growing with the length of lines. GOMAXPROCS is not set explicitly: it has
// defaulted to the number of CPUs since Go 1.5.
func execLoop(lines []string) {
	slots := make(chan struct{}, runtime.NumCPU())
	var wg sync.WaitGroup
	for _, url := range lines {
		slots <- struct{}{} // blocks until a worker slot is free
		wg.Add(1)
		go func(target string) {
			defer wg.Done()
			defer func() { <-slots }() // release the slot even if GetPage panics
			GetPage(target)
		}(url)
	}
	wg.Wait()
}

// main reads a list of URLs (one per line), processes them with execLoop,
// and logs the elapsed processing time in seconds.
//
// The list is read from the file named by the first command-line argument;
// when no argument is given it falls back to "url_list.txt" in the current
// directory.
func main() {
	filename := "url_list.txt"
	if len(os.Args) > 1 {
		filename = os.Args[1]
	}
	lines := File2Array(filename)

	// Only the fetch phase is timed; reading the URL list is excluded.
	start := time.Now()
	execLoop(lines)
	log.Printf("%f seconds\n", time.Since(start).Seconds())
}