package main import ( "context" "encoding/json" "flag" "fmt" "io/ioutil" "net/http" "os" "os/user" "path" "path/filepath" "time" "github.com/gocolly/colly" ) var format = flag.String("format", "epub", "format to download") var output = flag.String("output", "epub", "output directory") var store = flag.String("store", "~/.tal-scraper.json", "data store") func main() { flag.Parse() *output = fixpath(*output) *store = fixpath(*store) os.MkdirAll(*output, os.ModePerm) oldMostRecent := "" newMostRecent := "" data, _ := ioutil.ReadFile(*store) _ = json.Unmarshal(data, &oldMostRecent) done := false c := colly.NewCollector() c.OnHTML("div.amw-listing-item a[href]", func(e *colly.HTMLElement) { url := e.Attr("href") + "." + *format if newMostRecent == "" { newMostRecent = url } if newMostRecent == oldMostRecent { done = true return } fmt.Println(url) name := filenameForUrl(url) path := *output + "/" + name data, err := HTTPGet(url, time.Second*3) if err != nil { fmt.Println(url) fmt.Println(err) return } err = ioutil.WriteFile(path, data, 0644) if err != nil { fmt.Println(err) } }) c.OnHTML("a[href] i.fa-chevron-right", func(e *colly.HTMLElement) { next, _ := e.DOM.Parent().Attr("href") if !done { e.Request.Visit(next) } }) c.OnRequest(func(r *colly.Request) { fmt.Println("Checking", r.URL) }) c.Visit("https://theanarchistlibrary.org/latest") data, _ = json.Marshal(newMostRecent) _ = ioutil.WriteFile(*store, data, 0644) } func fixpath(path string) string { path = filepath.FromSlash(path) if (path)[0] == '~' { user, err := user.Current() if err != nil { fmt.Fprintln(os.Stderr, err.Error()) os.Exit(10) } path = filepath.Join(user.HomeDir, (path)[1:]) } path, _ = filepath.Abs(path) return path } func filenameForUrl(url string) string { return path.Base(url) } func HTTPGet(url string, timeout time.Duration) (content []byte, err error) { request, err := http.NewRequest("GET", url, nil) if err != nil { return } ctx, cancel_func := context.WithTimeout(context.Background(), timeout) request = request.WithContext(ctx) response, err := http.DefaultClient.Do(request) if err != nil { return } defer response.Body.Close() if response.StatusCode != 200 { cancel_func() return nil, fmt.Errorf("INVALID RESPONSE; status: %s", response.Status) } return ioutil.ReadAll(response.Body) } func fileExists(path string) bool { info, err := os.Stat(path) if os.IsNotExist(err) { return false } return !info.IsDir() }