This commit is contained in:
ron 2020-04-05 02:08:23 +02:00
commit 1e050a61f0
3 changed files with 196 additions and 0 deletions

18
go.mod Normal file
View File

@ -0,0 +1,18 @@
module taldl
go 1.14
require (
github.com/PuerkitoBio/goquery v1.5.1 // indirect
github.com/antchfx/htmlquery v1.2.2 // indirect
github.com/antchfx/xmlquery v1.2.3 // indirect
github.com/antchfx/xpath v1.1.4 // indirect
github.com/gobwas/glob v0.2.3 // indirect
github.com/gocolly/colly v1.2.0
github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e // indirect
github.com/kennygrant/sanitize v1.2.4 // indirect
github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca // indirect
github.com/temoto/robotstxt v1.1.1 // indirect
golang.org/x/net v0.0.0-20200324143707-d3edc9973b7e // indirect
google.golang.org/appengine v1.6.5 // indirect
)

42
go.sum Normal file
View File

@ -0,0 +1,42 @@
github.com/PuerkitoBio/goquery v1.5.1 h1:PSPBGne8NIUWw+/7vFBV+kG2J/5MOjbzc7154OaKCSE=
github.com/PuerkitoBio/goquery v1.5.1/go.mod h1:GsLWisAFVj4WgDibEWF4pvYnkVQBpKBKeU+7zCJoLcc=
github.com/andybalholm/cascadia v1.1.0 h1:BuuO6sSfQNFRu1LppgbD25Hr2vLYW25JvxHs5zzsLTo=
github.com/andybalholm/cascadia v1.1.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y=
github.com/antchfx/htmlquery v1.2.2 h1:exe4hUStBqXdRZ+9nB7EYA+W2zfIHIq3rRFpChh+VSk=
github.com/antchfx/htmlquery v1.2.2/go.mod h1:MS9yksVSQXls00iXkiMqXr0J+umL/AmxXKuP28SUJM8=
github.com/antchfx/xmlquery v1.2.3 h1:++irmxT+Pkn55FGtSTkUTHarZ6E0b1yyR+UiPZRA+eY=
github.com/antchfx/xmlquery v1.2.3/go.mod h1:/+CnyD/DzHRnv2eRxrVbieRU/FIF6N0C+7oTtyUtCKk=
github.com/antchfx/xpath v1.1.4 h1:naPIpjBGeT3eX0Vw7E8iyHsY8FGt6EbGdkcd8EZCo+g=
github.com/antchfx/xpath v1.1.4/go.mod h1:Yee4kTMuNiPYJ7nSNorELQMr1J33uOpXDMByNYhvtNk=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/gobwas/glob v0.2.3 h1:A4xDbljILXROh+kObIiy5kIaPYD8e96x1tgBhUI5J+Y=
github.com/gobwas/glob v0.2.3/go.mod h1:d3Ez4x06l9bZtSvzIay5+Yzi0fmZzPgnTbPcKjJAkT8=
github.com/gocolly/colly v1.2.0 h1:qRz9YAn8FIH0qzgNUw+HT9UN7wm1oF9OBAilwEWpyrI=
github.com/gocolly/colly v1.2.0/go.mod h1:Hof5T3ZswNVsOHYmba1u03W65HDWgpV5HifSuueE0EA=
github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e h1:1r7pUrabqp18hOBcwBwiTsbnFeTZHV9eER/QT5JVZxY=
github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc=
github.com/golang/protobuf v1.3.1 h1:YF8+flBXS5eO826T4nzqPrxfhQThhXl0YzfuUPu4SBg=
github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
github.com/kennygrant/sanitize v1.2.4 h1:gN25/otpP5vAsO2djbMhF/LQX6R7+O1TB4yv8NzpJ3o=
github.com/kennygrant/sanitize v1.2.4/go.mod h1:LGsjYYtgxbetdg5owWB2mpgUL6e2nfw2eObZ0u0qvak=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca h1:NugYot0LIVPxTvN8n+Kvkn6TrbMyxQiuvKdEwFdR9vI=
github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca/go.mod h1:uugorj2VCxiV1x+LzaIdVa9b4S4qGAcH6cbhh4qVxOU=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
github.com/temoto/robotstxt v1.1.1 h1:Gh8RCs8ouX3hRSxxK7B1mO5RFByQ4CmJZDwgom++JaA=
github.com/temoto/robotstxt v1.1.1/go.mod h1:+1AmkuG3IYkh1kv0d2qEB9Le88ehNO0zwOr3ujewlOo=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20190603091049-60506f45cf65/go.mod h1:HSz+uSET+XFnRR8LxR5pz3Of3rY3CfYBVs4xY44aLks=
golang.org/x/net v0.0.0-20200202094626-16171245cfb2/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20200324143707-d3edc9973b7e h1:3G+cUijn7XD+S4eJFddp53Pv7+slrESplyjG25HgL+k=
golang.org/x/net v0.0.0-20200324143707-d3edc9973b7e/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.2 h1:tW2bmiBqwgJj/UpqtC8EpXEZVYOwU0yG4iWbprSVAcs=
golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
google.golang.org/appengine v1.6.5 h1:tycE03LOZYQNhDpS27tcQdAzLCVMaj7QT2SXxebnpCM=
google.golang.org/appengine v1.6.5/go.mod h1:8WjMMxjGQR8xUklV/ARdw2HLXBOI7O7uCIDZVag1xfc=

136
main.go Normal file
View File

@ -0,0 +1,136 @@
package main
import (
"context"
"encoding/json"
"flag"
"fmt"
"io/ioutil"
"net/http"
"os"
"os/user"
"path"
"path/filepath"
"time"
"github.com/gocolly/colly"
)
var format = flag.String("format", "epub", "format to download")
var output = flag.String("output", "epub", "output directory")
var store = flag.String("store", "~/.tal-scraper.json", "data store")
func main() {
flag.Parse()
*output = fixpath(*output)
*store = fixpath(*store)
os.MkdirAll(*output, os.ModePerm)
oldMostRecent := ""
newMostRecent := ""
data, _ := ioutil.ReadFile(*store)
_ = json.Unmarshal(data, &oldMostRecent)
done := false
c := colly.NewCollector()
c.OnHTML("div.amw-listing-item a[href]", func(e *colly.HTMLElement) {
url := e.Attr("href") + "." + *format
if newMostRecent == "" {
newMostRecent = url
}
if newMostRecent == oldMostRecent {
done = true
return
}
fmt.Println(url)
name := filenameForUrl(url)
path := *output + "/" + name
data, err := HTTPGet(url, time.Second*3)
if err != nil {
fmt.Println(url)
fmt.Println(err)
return
}
err = ioutil.WriteFile(path, data, 0644)
if err != nil {
fmt.Println(err)
}
})
c.OnHTML("a[href] i.fa-chevron-right", func(e *colly.HTMLElement) {
next, _ := e.DOM.Parent().Attr("href")
if !done {
e.Request.Visit(next)
}
})
c.OnRequest(func(r *colly.Request) {
fmt.Println("Checking", r.URL)
})
c.Visit("https://theanarchistlibrary.org/latest")
data, _ = json.Marshal(newMostRecent)
_ = ioutil.WriteFile(*store, data, 0644)
}
func fixpath(path string) string {
path = filepath.FromSlash(path)
if (path)[0] == '~' {
user, err := user.Current()
if err != nil {
fmt.Fprintln(os.Stderr, err.Error())
os.Exit(10)
}
path = filepath.Join(user.HomeDir, (path)[1:])
}
path, _ = filepath.Abs(path)
return path
}
func filenameForUrl(url string) string {
return path.Base(url)
}
func HTTPGet(url string, timeout time.Duration) (content []byte, err error) {
request, err := http.NewRequest("GET", url, nil)
if err != nil {
return
}
ctx, cancel_func := context.WithTimeout(context.Background(), timeout)
request = request.WithContext(ctx)
response, err := http.DefaultClient.Do(request)
if err != nil {
return
}
defer response.Body.Close()
if response.StatusCode != 200 {
cancel_func()
return nil, fmt.Errorf("INVALID RESPONSE; status: %s", response.Status)
}
return ioutil.ReadAll(response.Body)
}
func fileExists(path string) bool {
info, err := os.Stat(path)
if os.IsNotExist(err) {
return false
}
return !info.IsDir()
}