package main import ( "archive/zip" "context" "encoding/json" "flag" "fmt" "io" "io/ioutil" "net/http" "os" "os/user" "path" "path/filepath" "strconv" "strings" "sync" "time" "github.com/gocolly/colly" "github.com/schollz/progressbar/v3" ) type Cache struct { MostRecent string Pages int } var cacheFile = "~/.taldl.json" var outputDir = flag.String("output", "~/TAL", "output directory") var formats = flag.String("formats", "zip,epub,pdf,a4.pdf,lt.pdf", "formats to download") var progress = flag.Bool("progress", true, "show progress bar") var verbose = flag.Bool("verbose", true, "verbose output") var workers = flag.Int("workers", 5, "amount of workers") var update = flag.Bool("update", false, "update all entries") var mostRecent string var done bool var cache Cache var useFormat map[string]bool func main() { var bar *progressbar.ProgressBar var lastPage int var hrefs = []string{} var hrefsMutex sync.Mutex var done bool flag.Parse() useFormat = make(map[string]bool) fmts := strings.Split(*formats, ",") for _, f := range fmts { useFormat[f] = true } cacheFile = fixPath(cacheFile) cacheData, _ := ioutil.ReadFile(cacheFile) _ = json.Unmarshal(cacheData, &cache) *outputDir = fixPath(*outputDir) os.MkdirAll(*outputDir, os.ModePerm) c := colly.NewCollector() c.OnHTML("ul.pagination li:nth-last-child(2)", func(e *colly.HTMLElement) { lastPage, _ = strconv.Atoi(strings.TrimSpace(e.Text)) }) c.OnHTML("div.amw-listing-item:nth-child(1) a[href]", func(e *colly.HTMLElement) { href := e.Attr("href") mostRecent = href }) c.OnHTML("div.amw-listing-item a[href]", func(e *colly.HTMLElement) { href := e.Attr("href") if cache.MostRecent == href { done = true } if !done || *update { hrefs = append(hrefs, href) } }) url := "https://theanarchistlibrary.org/latest/1" c.Visit(url) newPages := 1 if *update { newPages = lastPage } else { newPages = lastPage - cache.Pages } if newPages > 0 { if *verbose { fmt.Fprintf(os.Stderr, "Checking latest entries... \r\n") } if *progress { bar = progressbar.Default(int64(newPages)) bar.Add(1) } } scanJobs := make(chan int, newPages) scanResults := make(chan []string, newPages) for w := 1; w <= *workers; w++ { go scanner(w, scanJobs, scanResults) } for a := 2; a <= newPages; a++ { scanJobs <- a } close(scanJobs) for a := 2; a <= newPages; a++ { result := <-scanResults hrefsMutex.Lock() hrefs = append(hrefs, result...) hrefsMutex.Unlock() if *progress { bar.Add(1) } } close(scanResults) cache.Pages = lastPage cache.MostRecent = mostRecent numJobs := len(hrefs) if numJobs == 0 { save(cache) return } if *verbose { fmt.Fprintf(os.Stderr, "Checking %d entries for updates...\r\n", numJobs) } if *progress { bar = progressbar.Default(int64(numJobs)) bar.ChangeMax(numJobs) } checkJobs := make(chan string, numJobs) checkResults := make(chan string, numJobs) downloadJobs := make(chan string, numJobs) downloadResults := make(chan string, numJobs) downloadCount := 0 for w := 1; w <= *workers; w++ { go checker(w, checkJobs, checkResults) } for _, href := range hrefs { checkJobs <- href } close(checkJobs) for a := 1; a <= numJobs; a++ { r := <-checkResults if *progress { bar.Add(1) } if r != "" { downloadJobs <- r downloadCount++ } } close(checkResults) close(downloadJobs) if downloadCount == 0 { save(cache) return } if *verbose { fmt.Fprintf(os.Stderr, "Downloading %d entries...\r\n", downloadCount) } if *progress { bar = progressbar.Default(int64(downloadCount)) } for w := 1; w <= *workers; w++ { go downloader(w, downloadJobs, downloadResults) } buffer := "" for a := 1; a <= downloadCount; a++ { r := <-downloadResults if *progress { bar.Add(1) } if r != "" { buffer += r } } close(downloadResults) save(cache) if buffer != "" { if *verbose { fmt.Fprintln(os.Stderr, buffer) } } } func fixPath(path string) string { path = filepath.FromSlash(path) if (path)[0] == '~' { user, err := user.Current() if err != nil { fmt.Fprintln(os.Stderr, err.Error()) os.Exit(10) } path = filepath.Join(user.HomeDir, (path)[1:]) } path, _ = filepath.Abs(path) return path } func filenameForUrl(url string) string { return path.Base(url) } func check(url string, path string, timeout time.Duration) (modified bool, err error) { request, err := http.NewRequest("HEAD", url, nil) if err != nil { return } ctx, cancel := context.WithTimeout(context.Background(), timeout) defer cancel() request = request.WithContext(ctx) response, err := http.DefaultClient.Do(request) if err != nil { return } defer response.Body.Close() if response.StatusCode == 404 { return false, nil } if response.StatusCode != 200 { return false, fmt.Errorf("%s: %s", url, response.Status) } lmh := response.Header.Get("Last-Modified") lm, err := http.ParseTime(lmh) if err != nil { return true, err } file, err := os.Stat(path) if err != nil { return true, nil } clh := response.Header.Get("Content-Length") cl, err := strconv.ParseInt(clh, 10, 0) if err != nil { return true, nil } if file.Size() != cl { return true, nil } if lm.After(file.ModTime()) { return true, nil } return false, nil } func get(url string, timeout time.Duration) (content []byte, err error) { request, err := http.NewRequest("GET", url, nil) if err != nil { return } ctx, cancel := context.WithTimeout(context.Background(), timeout) defer cancel() request = request.WithContext(ctx) response, err := http.DefaultClient.Do(request) if err != nil { return } defer response.Body.Close() if response.StatusCode == 404 { return } if response.StatusCode != 200 { return nil, fmt.Errorf("%s: %s", url, response.Status) } return ioutil.ReadAll(response.Body) } func fileExists(path string) bool { info, err := os.Stat(path) if os.IsNotExist(err) { return false } return !info.IsDir() } func download(url string, path string) (err error) { for { data, err := get(url, time.Second*60) if err != nil { fmt.Fprintf(os.Stderr, "error: %s: %s\n", url, err) time.Sleep(time.Second) continue } err = ioutil.WriteFile(path, data, 0644) if err != nil { fmt.Fprintf(os.Stderr, "error: %s\n", err) return err } break } return nil } func unzip(src string, dest string) ([]string, error) { var filenames []string r, err := zip.OpenReader(src) if err != nil { return filenames, err } defer r.Close() for _, f := range r.File { fpath := filepath.Join(dest, f.Name) if !strings.HasPrefix(fpath, filepath.Clean(dest)+string(os.PathSeparator)) { return filenames, fmt.Errorf("%s: bad file path", fpath) } filenames = append(filenames, fpath) if f.FileInfo().IsDir() { os.MkdirAll(fpath, os.ModePerm) continue } if err = os.MkdirAll(filepath.Dir(fpath), os.ModePerm); err != nil { return filenames, err } outFile, err := os.OpenFile(fpath, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, f.Mode()) if err != nil { return filenames, err } rc, err := f.Open() if err != nil { return filenames, err } _, err = io.Copy(outFile, rc) outFile.Close() rc.Close() if err != nil { return filenames, err } } return filenames, nil } func downloader(id int, jobs <-chan string, results chan<- string) { for href := range jobs { result := "" dir := filenameForUrl(href) dest := filepath.Join(*outputDir, dir) os.MkdirAll(dest, 0700) downloadFormat(href, "zip", dest) downloadFormat(href, "epub", dest) downloadFormat(href, "pdf", dest) downloadFormat(href, "a4.pdf", dest) downloadFormat(href, "lt.pdf", dest) if *verbose { results <- result } else { results <- "" } } } func scanner(id int, jobs <-chan int, results chan<- []string) { var result []string c := colly.NewCollector() c.OnHTML("div.amw-listing-item a[href]", func(e *colly.HTMLElement) { href := e.Attr("href") if cache.MostRecent == href { done = true } if !done { result = append(result, href) } }) c.AllowURLRevisit = true for i := range jobs { url := fmt.Sprintf("https://theanarchistlibrary.org/latest/%d", i) for { result = []string{} err := c.Visit(url) if err != nil { fmt.Fprintf(os.Stderr, "error: %s\n", err) time.Sleep(time.Second) } else { break } } results <- result } } func checker(id int, jobs <-chan string, results chan<- string) { var modified bool var err error for href := range jobs { ext := "muse" url := href + "." + ext dir := filenameForUrl(href) name := filenameForUrl(url) path := filepath.Join(*outputDir, dir, name) if !fileExists(path) { results <- href continue } for { modified, err = check(url, path, time.Second*30) if err != nil { fmt.Fprintf(os.Stderr, "error: %s\n", err) time.Sleep(time.Second) continue } break } if modified { results <- href } else { results <- "" } } } func save(cache Cache) { cacheData, _ := json.Marshal(&cache) _ = ioutil.WriteFile(cacheFile, cacheData, 0644) } func downloadFormat(href string, ext string, dest string) error { if !useFormat[ext] { return nil } url := href + "." + ext name := filenameForUrl(url) if ext == "zip" { tmpDir, err := ioutil.TempDir(os.TempDir(), "taldl") if err != nil { fmt.Fprintf(os.Stderr, "error: %s\n", err) return err } path := filepath.Join(tmpDir, name) err = download(url, path) if err != nil { fmt.Fprintf(os.Stderr, "error: %s\n", err) return err } _, err = unzip(path, *outputDir) if err != nil { fmt.Fprintf(os.Stderr, "error: %s\n", err) return err } os.RemoveAll(tmpDir) } else { path := filepath.Join(dest, name) err := download(url, path) if err != nil { fmt.Fprintf(os.Stderr, "error: %s\n", err) return err } } return nil }