package main
import (
"bufio"
"bytes"
"fmt"
"golang.org/x/net/html"
"gopkg.in/xmlpath.v2"
"io"
"log"
"net/http"
"os"
"runtime"
"strings"
"sync"
"time"
)
// Result holds the metadata scraped from a single HTML page. A field is left
// as the empty string when the page has no matching element.
type Result struct {
	// Title is the text of the first node matched by //title/text().
	Title string

	// Keywords is the content attribute of the first <meta name="keywords"> tag.
	Keywords string

	// Description is the content attribute of the first
	// <meta name="description"> tag.
	Description string
}
func GetPage(url string) {
var d Result
request, err := http.NewRequest("Get", url, nil)
if err != nil {
log.Fatal(err)
}
request.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36")
//request.Header.Set("User-Agent", "Mozilla/5.0 (iPhone; CPU iPhone OS 5_0 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 Mobile/9A334 Safari/7534.48.3")
client := new(http.Client)
response, err := client.Do(request)
if err != nil {
log.Fatal(err)
}
defer response.Body.Close()
brokenHtml := io.Reader(response.Body)
root, err := html.Parse(brokenHtml)
if err != nil {
log.Fatal(err)
}
var b bytes.Buffer
html.Render(&b, root)
fixedHtml := b.String()
reader := strings.NewReader(fixedHtml)
xmlroot, xmlerr := xmlpath.ParseHTML(reader)
if xmlerr != nil {
log.Fatal(xmlerr)
}
xpathTitle := `//title/text()`
xpathKeywords := `//meta[@name="keywords"]/@content`
xpathDescription := `//meta[@name="description"]/@content`
pathTitle := xmlpath.MustCompile(xpathTitle)
if value, ok := pathTitle.String(xmlroot); ok {
d.Title = value
}
pathKeywords := xmlpath.MustCompile(xpathKeywords)
if value, ok := pathKeywords.String(xmlroot); ok {
d.Keywords = value
}
pathDescription := xmlpath.MustCompile(xpathDescription)
if value, ok := pathDescription.String(xmlroot); ok {
d.Description = value
}
log.Printf("%s\t%s\t%s\t%s\n", url, d.Title, d.Keywords, d.Description)
fmt.Printf("%s\t%s\t%s\t%s\n", url, d.Title, d.Keywords, d.Description)
}
// File2Array reads the text file at filePath and returns its lines in file
// order, without their line terminators.
//
// If the file cannot be opened, a message is written to standard error and the
// process exits with status 1. If scanning stops early (for example on a line
// longer than bufio.Scanner's token limit), the scanner's error is reported on
// standard error and the lines read up to that point are returned.
func File2Array(filePath string) []string {
	f, err := os.Open(filePath)
	if err != nil {
		fmt.Fprintf(os.Stderr, "File %s could not read: %v\n", filePath, err)
		os.Exit(1)
	}
	defer f.Close()

	// 450 is only an initial capacity hint; append grows the slice as needed.
	lines := make([]string, 0, 450)
	scanner := bufio.NewScanner(f)
	for scanner.Scan() {
		lines = append(lines, scanner.Text())
	}
	// Report the scanner's own error; the error from os.Open is always nil
	// by the time execution reaches this point.
	if err := scanner.Err(); err != nil {
		fmt.Fprintf(os.Stderr, "File %s scan error: %v\n", filePath, err)
	}
	return lines
}
// execLoop calls GetPage for every entry of lines, running at most
// runtime.NumCPU() calls concurrently, and returns once all of them have
// finished.
func execLoop(lines []string) {
	cpus := runtime.NumCPU()
	runtime.GOMAXPROCS(cpus)

	var wg sync.WaitGroup
	// Buffered channel used as a counting semaphore with one slot per
	// permitted concurrent fetch.
	semaphore := make(chan struct{}, cpus)
	for _, url := range lines {
		// Take a slot before starting the goroutine so that no more than
		// cpus goroutines exist at any time, instead of one parked goroutine
		// per URL.
		semaphore <- struct{}{}
		wg.Add(1)
		go func(target string) {
			defer wg.Done()
			defer func() { <-semaphore }()
			GetPage(target)
		}(url)
	}
	wg.Wait()
}
// main loads the URL list from url_list.txt, hands it to execLoop, and logs
// how many seconds that processing took. Reading the list is not included in
// the measured time.
func main() {
	const urlListPath = "url_list.txt"
	urls := File2Array(urlListPath)

	began := time.Now()
	execLoop(urls)
	elapsed := time.Since(began)

	log.Printf("%f seconds\n", elapsed.Seconds())
}