From 1e050a61f008301601d6c6a104d559b7efe8a6a5 Mon Sep 17 00:00:00 2001
From: ron
Date: Sun, 5 Apr 2020 02:08:23 +0200
Subject: [PATCH] init

---
 go.mod  |  18 +++++
 go.sum  |  42 +++++++++++
 main.go | 214 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 274 insertions(+)
 create mode 100644 go.mod
 create mode 100644 go.sum
 create mode 100644 main.go

diff --git a/go.mod b/go.mod
new file mode 100644
index 0000000..dc73581
--- /dev/null
+++ b/go.mod
@@ -0,0 +1,18 @@
+module taldl
+
+go 1.14
+
+require (
+	github.com/PuerkitoBio/goquery v1.5.1 // indirect
+	github.com/antchfx/htmlquery v1.2.2 // indirect
+	github.com/antchfx/xmlquery v1.2.3 // indirect
+	github.com/antchfx/xpath v1.1.4 // indirect
+	github.com/gobwas/glob v0.2.3 // indirect
+	github.com/gocolly/colly v1.2.0
+	github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e // indirect
+	github.com/kennygrant/sanitize v1.2.4 // indirect
+	github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca // indirect
+	github.com/temoto/robotstxt v1.1.1 // indirect
+	golang.org/x/net v0.0.0-20200324143707-d3edc9973b7e // indirect
+	google.golang.org/appengine v1.6.5 // indirect
+)
diff --git a/go.sum b/go.sum
new file mode 100644
index 0000000..3b8f04c
--- /dev/null
+++ b/go.sum
@@ -0,0 +1,42 @@
+github.com/PuerkitoBio/goquery v1.5.1 h1:PSPBGne8NIUWw+/7vFBV+kG2J/5MOjbzc7154OaKCSE=
+github.com/PuerkitoBio/goquery v1.5.1/go.mod h1:GsLWisAFVj4WgDibEWF4pvYnkVQBpKBKeU+7zCJoLcc=
+github.com/andybalholm/cascadia v1.1.0 h1:BuuO6sSfQNFRu1LppgbD25Hr2vLYW25JvxHs5zzsLTo=
+github.com/andybalholm/cascadia v1.1.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y=
+github.com/antchfx/htmlquery v1.2.2 h1:exe4hUStBqXdRZ+9nB7EYA+W2zfIHIq3rRFpChh+VSk=
+github.com/antchfx/htmlquery v1.2.2/go.mod h1:MS9yksVSQXls00iXkiMqXr0J+umL/AmxXKuP28SUJM8=
+github.com/antchfx/xmlquery v1.2.3 h1:++irmxT+Pkn55FGtSTkUTHarZ6E0b1yyR+UiPZRA+eY=
+github.com/antchfx/xmlquery v1.2.3/go.mod h1:/+CnyD/DzHRnv2eRxrVbieRU/FIF6N0C+7oTtyUtCKk=
+github.com/antchfx/xpath v1.1.4 h1:naPIpjBGeT3eX0Vw7E8iyHsY8FGt6EbGdkcd8EZCo+g=
+github.com/antchfx/xpath v1.1.4/go.mod h1:Yee4kTMuNiPYJ7nSNorELQMr1J33uOpXDMByNYhvtNk=
+github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
+github.com/gobwas/glob v0.2.3 h1:A4xDbljILXROh+kObIiy5kIaPYD8e96x1tgBhUI5J+Y=
+github.com/gobwas/glob v0.2.3/go.mod h1:d3Ez4x06l9bZtSvzIay5+Yzi0fmZzPgnTbPcKjJAkT8=
+github.com/gocolly/colly v1.2.0 h1:qRz9YAn8FIH0qzgNUw+HT9UN7wm1oF9OBAilwEWpyrI=
+github.com/gocolly/colly v1.2.0/go.mod h1:Hof5T3ZswNVsOHYmba1u03W65HDWgpV5HifSuueE0EA=
+github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e h1:1r7pUrabqp18hOBcwBwiTsbnFeTZHV9eER/QT5JVZxY=
+github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc=
+github.com/golang/protobuf v1.3.1 h1:YF8+flBXS5eO826T4nzqPrxfhQThhXl0YzfuUPu4SBg=
+github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
+github.com/kennygrant/sanitize v1.2.4 h1:gN25/otpP5vAsO2djbMhF/LQX6R7+O1TB4yv8NzpJ3o=
+github.com/kennygrant/sanitize v1.2.4/go.mod h1:LGsjYYtgxbetdg5owWB2mpgUL6e2nfw2eObZ0u0qvak=
+github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
+github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca h1:NugYot0LIVPxTvN8n+Kvkn6TrbMyxQiuvKdEwFdR9vI=
+github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca/go.mod h1:uugorj2VCxiV1x+LzaIdVa9b4S4qGAcH6cbhh4qVxOU=
+github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
+github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
+github.com/temoto/robotstxt v1.1.1 h1:Gh8RCs8ouX3hRSxxK7B1mO5RFByQ4CmJZDwgom++JaA=
+github.com/temoto/robotstxt v1.1.1/go.mod h1:+1AmkuG3IYkh1kv0d2qEB9Le88ehNO0zwOr3ujewlOo=
+golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
+golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
+golang.org/x/net v0.0.0-20190603091049-60506f45cf65/go.mod h1:HSz+uSET+XFnRR8LxR5pz3Of3rY3CfYBVs4xY44aLks=
+golang.org/x/net v0.0.0-20200202094626-16171245cfb2/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
+golang.org/x/net v0.0.0-20200324143707-d3edc9973b7e h1:3G+cUijn7XD+S4eJFddp53Pv7+slrESplyjG25HgL+k=
+golang.org/x/net v0.0.0-20200324143707-d3edc9973b7e/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A=
+golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
+golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
+golang.org/x/text v0.3.2 h1:tW2bmiBqwgJj/UpqtC8EpXEZVYOwU0yG4iWbprSVAcs=
+golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk=
+golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
+google.golang.org/appengine v1.6.5 h1:tycE03LOZYQNhDpS27tcQdAzLCVMaj7QT2SXxebnpCM=
+google.golang.org/appengine v1.6.5/go.mod h1:8WjMMxjGQR8xUklV/ARdw2HLXBOI7O7uCIDZVag1xfc=
diff --git a/main.go b/main.go
new file mode 100644
index 0000000..d512f21
--- /dev/null
+++ b/main.go
@@ -0,0 +1,214 @@
+// Command taldl downloads newly listed texts from The Anarchist Library and
+// records the newest entry it saw so that later runs can stop there.
+package main
+
+import (
+	"context"
+	"encoding/json"
+	"flag"
+	"fmt"
+	"io/ioutil"
+	"net/http"
+	"os"
+	"os/user"
+	"path"
+	"path/filepath"
+	"time"
+
+	"github.com/gocolly/colly"
+)
+
+// latestURL is the listing page the crawl starts from.
+const latestURL = "https://theanarchistlibrary.org/latest"
+
+// downloadTimeout bounds a single download, including reading the body.
+const downloadTimeout = 3 * time.Second
+
+var format = flag.String("format", "epub", "format to download")
+var output = flag.String("output", "epub", "output directory")
+var store = flag.String("store", "~/.tal-scraper.json", "data store")
+
+// main crawls the listing page by page and downloads every entry, in the
+// requested format, that appears before the entry recorded by the last run.
+func main() {
+	flag.Parse()
+
+	*output = fixpath(*output)
+	*store = fixpath(*store)
+
+	if err := os.MkdirAll(*output, os.ModePerm); err != nil {
+		fmt.Fprintln(os.Stderr, err)
+		os.Exit(1)
+	}
+
+	oldMostRecent := loadMostRecent(*store)
+	newMostRecent := ""
+	done := false
+
+	c := colly.NewCollector()
+
+	c.OnHTML("div.amw-listing-item a[href]", func(e *colly.HTMLElement) {
+		if done {
+			return
+		}
+
+		url := e.Attr("href") + "." + *format
+
+		if newMostRecent == "" {
+			newMostRecent = url
+		}
+
+		// Compare the current entry, not the first one seen, with the stored
+		// marker. The listing is assumed to be newest-first, so everything
+		// from the marker onwards was handled by an earlier run.
+		if url == oldMostRecent {
+			done = true
+			return
+		}
+
+		fmt.Println(url)
+
+		dest := filepath.Join(*output, filenameForUrl(url))
+
+		data, err := HTTPGet(url, downloadTimeout)
+		if err != nil {
+			fmt.Fprintln(os.Stderr, url)
+			fmt.Fprintln(os.Stderr, err)
+			return
+		}
+
+		if err := ioutil.WriteFile(dest, data, 0644); err != nil {
+			fmt.Fprintln(os.Stderr, err)
+		}
+	})
+
+	c.OnHTML("a[href] i.fa-chevron-right", func(e *colly.HTMLElement) {
+		if done {
+			return
+		}
+
+		next, ok := e.DOM.Parent().Attr("href")
+		if !ok {
+			return
+		}
+
+		// Best effort: a rejected visit only ends pagination through this link.
+		_ = e.Request.Visit(next)
+	})
+
+	c.OnRequest(func(r *colly.Request) {
+		fmt.Println("Checking", r.URL)
+	})
+
+	if err := c.Visit(latestURL); err != nil {
+		fmt.Fprintln(os.Stderr, err)
+		os.Exit(1)
+	}
+
+	// Keep the old marker when no entry was seen, so that a failed crawl does
+	// not make the next run download everything again.
+	if newMostRecent == "" {
+		return
+	}
+
+	if err := saveMostRecent(*store, newMostRecent); err != nil {
+		fmt.Fprintln(os.Stderr, err)
+		os.Exit(1)
+	}
+}
+
+// loadMostRecent returns the URL saved in file by a previous run, or "" when
+// the file is missing or cannot be read or decoded.
+func loadMostRecent(file string) string {
+	data, err := ioutil.ReadFile(file)
+	if err != nil {
+		// A missing store is the normal state on the first run.
+		if !os.IsNotExist(err) {
+			fmt.Fprintln(os.Stderr, err)
+		}
+		return ""
+	}
+
+	var mostRecent string
+	if err := json.Unmarshal(data, &mostRecent); err != nil {
+		fmt.Fprintln(os.Stderr, err)
+		return ""
+	}
+	return mostRecent
+}
+
+// saveMostRecent writes url to file as a JSON string.
+func saveMostRecent(file, url string) error {
+	data, err := json.Marshal(url)
+	if err != nil {
+		return err
+	}
+	return ioutil.WriteFile(file, data, 0644)
+}
+
+// fixpath converts path to an absolute path in the OS-native form. A leading
+// "~" that is the whole path or is followed by a separator is replaced by the
+// current user's home directory. An empty path is returned unchanged.
+func fixpath(path string) string {
+	if path == "" {
+		return path
+	}
+
+	path = filepath.FromSlash(path)
+	if path == "~" || (len(path) > 1 && path[0] == '~' && os.IsPathSeparator(path[1])) {
+		current, err := user.Current()
+		if err != nil {
+			fmt.Fprintln(os.Stderr, err.Error())
+			os.Exit(10)
+		}
+		path = filepath.Join(current.HomeDir, path[1:])
+	}
+
+	abs, err := filepath.Abs(path)
+	if err != nil {
+		// Abs needs the working directory; fall back to the path as given.
+		return path
+	}
+	return abs
+}
+
+// filenameForUrl returns the last slash-separated element of url.
+func filenameForUrl(url string) string {
+	return path.Base(url)
+}
+
+// HTTPGet fetches url with a GET request and returns the response body. The
+// whole exchange, including reading the body, must finish within timeout, and
+// any status other than 200 is returned as an error.
+func HTTPGet(url string, timeout time.Duration) (content []byte, err error) {
+	ctx, cancel := context.WithTimeout(context.Background(), timeout)
+	// Release the context on every return path, not only on a bad status.
+	defer cancel()
+
+	request, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
+	if err != nil {
+		return nil, err
+	}
+
+	response, err := http.DefaultClient.Do(request)
+	if err != nil {
+		return nil, err
+	}
+	defer response.Body.Close()
+
+	if response.StatusCode != http.StatusOK {
+		return nil, fmt.Errorf("INVALID RESPONSE; status: %s", response.Status)
+	}
+
+	return ioutil.ReadAll(response.Body)
+}
+
+// fileExists reports whether path names an existing entry that is not a
+// directory. Any Stat error is treated as the entry not existing.
+func fileExists(path string) bool {
+	info, err := os.Stat(path)
+	if err != nil {
+		return false
+	}
+	return !info.IsDir()
+}