// taldl mirrors texts from The Anarchist Library into a local directory,
// downloading each entry in the requested formats and keeping a small JSON
// cache so subsequent runs only fetch new or modified entries.
package main

import (
	"archive/zip"
	"context"
	"encoding/json"
	"flag"
	"fmt"
	"io"
	"net/http"
	"os"
	"os/user"
	"path"
	"path/filepath"
	"strconv"
	"strings"
	"sync"
	"sync/atomic"
	"time"

	"github.com/gocolly/colly"
	"github.com/schollz/progressbar/v3"
)

var outputDir = flag.String("output", "~/TAL", "output directory")
var formats = flag.String("formats", "zip,epub,pdf,a4.pdf,lt.pdf", "formats to download")
var progress = flag.Bool("progress", true, "show progress bar")
var verbose = flag.Bool("verbose", true, "verbose output")
var workers = flag.Int("workers", 5, "number of workers")
var update = flag.Bool("update", false, "update all entries")

// Cache records how far previous runs got, so only newer entries need to be
// scanned next time.
type Cache struct {
	MostRecent string
	Pages      int
}

var cacheFile = "~/.taldl.json"
var cache Cache
var mostRecent string

// done is shared by the scanner workers; it flips to true once the cached
// most-recent entry is reached. atomic.Bool avoids a data race between
// concurrent workers.
var done atomic.Bool

var useFormat map[string]bool

// missing collects URLs that returned 404; guarded by missingMutex because
// several downloader goroutines may append concurrently.
var missing []string
var missingMutex sync.Mutex

func main() {
	var bar *progressbar.ProgressBar
	var lastPage int
	var hrefs = []string{}
	var hrefsMutex sync.Mutex
	var done bool // shadows the package-level flag; local to the first-page scan

	flag.Parse()

	useFormat = make(map[string]bool)
	fmts := strings.Split(*formats, ",")
	for _, f := range fmts {
		useFormat[f] = true
	}

	cacheFile = fixPath(cacheFile)
	cacheData, _ := os.ReadFile(cacheFile)
	_ = json.Unmarshal(cacheData, &cache)

	*outputDir = fixPath(*outputDir)
	os.MkdirAll(*outputDir, os.ModePerm)

	wg := sync.WaitGroup{}
	c := colly.NewCollector()

	// The second-to-last pagination item holds the number of the last page.
	c.OnHTML("ul.pagination li:nth-last-child(2)", func(e *colly.HTMLElement) {
		lastPage, _ = strconv.Atoi(strings.TrimSpace(e.Text))
		wg.Done()
	})

	// The first listing item on page 1 is the most recent entry overall.
	c.OnHTML("div.amw-listing-item:nth-child(1) a[href]", func(e *colly.HTMLElement) {
		mostRecent = e.Attr("href")
	})

	c.OnHTML("div.amw-listing-item a[href]", func(e *colly.HTMLElement) {
		href := e.Attr("href")
		if cache.MostRecent == href {
			done = true
		}
		if !done || *update {
			hrefs = append(hrefs, href)
		}
	})

	wg.Add(1)
	url := "https://theanarchistlibrary.org/latest/1"
	c.Visit(url)
	wg.Wait()

	newPages := 1
	if *update {
		newPages = lastPage
	} else {
		newPages = lastPage - cache.Pages
	}
	if newPages < 0 {
		// A stale cache can claim more pages than the site now has; a
		// negative value would make the buffered channels below panic.
		newPages = 0
	}

	if newPages > 0 {
		if *verbose {
			fmt.Fprintf(os.Stderr, "Scanning latest entries...\r\n")
		}
		if *progress {
			bar = progressbar.Default(int64(newPages))
			bar.Add(1) // page 1 was already scanned above
		}
	}

	scanJobs := make(chan int, newPages)
	scanResults := make(chan []string, newPages)

	for w := 1; w <= *workers; w++ {
		go scanner(w, scanJobs, scanResults)
	}
	for a := 2; a <= newPages; a++ {
		scanJobs <- a
	}
	close(scanJobs)
	for a := 2; a <= newPages; a++ {
		result := <-scanResults
		hrefsMutex.Lock()
		hrefs = append(hrefs, result...)
		hrefsMutex.Unlock()
		if *progress {
			bar.Add(1)
		}
	}
	close(scanResults)

	cache.Pages = lastPage
	cache.MostRecent = mostRecent

	numJobs := len(hrefs) * len(useFormat)
	if numJobs == 0 {
		save(cache)
		return
	}

	if *verbose {
		fmt.Fprintf(os.Stderr, "Checking %d files for updates...\r\n", numJobs)
	}
	if *progress {
		bar = progressbar.Default(int64(numJobs))
	}

	checkJobs := make(chan string, numJobs)
	checkResults := make(chan string, numJobs)
	downloadJobs := make(chan string, numJobs)
	downloadResults := make(chan string, numJobs)
	downloadCount := 0

	for w := 1; w <= *workers; w++ {
		go checker(w, checkJobs, checkResults)
	}
	for _, href := range hrefs {
		for ext, use := range useFormat {
			if use {
				checkJobs <- href + "." + ext
			}
		}
	}
	close(checkJobs)
	for a := 1; a <= numJobs; a++ {
		r := <-checkResults
		if *progress {
			bar.Add(1)
		}
		if r != "" {
			downloadJobs <- r
			downloadCount++
		}
	}
	close(checkResults)
	close(downloadJobs)

	if downloadCount == 0 {
		save(cache)
		return
	}

	if *verbose {
		fmt.Fprintf(os.Stderr, "Downloading %d files...\r\n", downloadCount)
	}
	if *progress {
		bar = progressbar.Default(int64(downloadCount))
	}

	for w := 1; w <= *workers; w++ {
		go downloader(w, downloadJobs, downloadResults)
	}

	buffer := ""
	for a := 1; a <= downloadCount; a++ {
		r := <-downloadResults
		if *progress {
			bar.Add(1)
		}
		if r != "" {
			buffer += r
		}
	}
	close(downloadResults)

	save(cache)

	if buffer != "" && *verbose {
		fmt.Fprintln(os.Stderr, buffer)
	}
	if len(missing) > 0 {
		fmt.Fprintln(os.Stderr, "Not found:")
		for _, url := range missing {
			fmt.Fprintln(os.Stderr, url)
		}
	}
}

// fixPath expands a leading "~" to the current user's home directory and
// returns an absolute path.
func fixPath(p string) string {
	p = filepath.FromSlash(p)
	if len(p) > 0 && p[0] == '~' {
		user, err := user.Current()
		if err != nil {
			fmt.Fprintln(os.Stderr, err.Error())
			os.Exit(10)
		}
		p = filepath.Join(user.HomeDir, p[1:])
	}
	p, _ = filepath.Abs(p)
	return p
}

func filenameForUrl(url string) string {
	return path.Base(url)
}

// dirnameForUrl derives the entry's directory name from its URL, stripping
// the format extension and the ".a4"/".lt" PDF sub-extensions.
func dirnameForUrl(url string) string {
	base := path.Base(url)
	ext := filepath.Ext(base)
	dirname := strings.TrimSuffix(base, ext)
	if ext == ".pdf" {
		for _, se := range []string{".a4", ".lt"} {
			if strings.HasSuffix(dirname, se) {
				dirname = strings.TrimSuffix(dirname, se)
				break
			}
		}
	}
	return dirname
}

// check issues a HEAD request and reports whether the remote file is newer or
// differently sized than the local copy. Zip archives are compared via their
// .muse source file, since that is what ends up on disk after unpacking.
func check(url string, path string, timeout time.Duration) (modified bool, err error) {
	if strings.HasSuffix(url, ".zip") {
		url = strings.TrimSuffix(url, ".zip") + ".muse"
		path = strings.TrimSuffix(path, ".zip") + ".muse"
	}
	request, err := http.NewRequest("HEAD", url, nil)
	if err != nil {
		return false, err
	}
	ctx, cancel := context.WithTimeout(context.Background(), timeout)
	defer cancel()
	request = request.WithContext(ctx)
	response, err := http.DefaultClient.Do(request)
	if err != nil {
		return false, err
	}
	defer response.Body.Close()
	if response.StatusCode == 404 {
		fmt.Fprintln(os.Stderr, url+" not found.")
		return false, nil
	}
	if response.StatusCode != 200 {
		return false, fmt.Errorf("%s: %s", url, response.Status)
	}
	lmh := response.Header.Get("Last-Modified")
	lm, err := http.ParseTime(lmh)
	if err != nil {
		// No usable Last-Modified header: assume modified rather than
		// returning an error, which would make the caller retry forever.
		return true, nil
	}
	file, err := os.Stat(path)
	if err != nil {
		return true, nil
	}
	clh := response.Header.Get("Content-Length")
	cl, err := strconv.ParseInt(clh, 10, 0)
	if err != nil {
		return true, nil
	}
	if file.Size() != cl {
		return true, nil
	}
	if lm.After(file.ModTime()) {
		return true, nil
	}
	return false, nil
}

// get fetches a URL with a timeout; 404s are recorded in missing and returned
// as empty content rather than as an error.
func get(url string, timeout time.Duration) (content []byte, err error) {
	request, err := http.NewRequest("GET", url, nil)
	if err != nil {
		return
	}
	ctx, cancel := context.WithTimeout(context.Background(), timeout)
	defer cancel()
	request = request.WithContext(ctx)
	response, err := http.DefaultClient.Do(request)
	if err != nil {
		return
	}
	defer response.Body.Close()
	if response.StatusCode == 404 {
		missingMutex.Lock()
		missing = append(missing, url)
		missingMutex.Unlock()
		return
	}
	if response.StatusCode != 200 {
		return nil, fmt.Errorf("%s: %s", url, response.Status)
	}
	return io.ReadAll(response.Body)
}

func fileExists(path string) bool {
	info, err := os.Stat(path)
	if err != nil {
		// Covers "not exist" as well as other stat errors; info would be
		// nil and unusable either way.
		return false
	}
	return !info.IsDir()
}

// download retries until the URL is fetched (or recorded as missing), then
// writes any non-empty content to path.
func download(url string, path string) (err error) {
	for {
		data, err := get(url, time.Second*60)
		if err != nil {
			fmt.Fprintf(os.Stderr, "error: %s: %s\n", url, err)
			time.Sleep(time.Second)
			continue
		}
		if len(data) > 0 {
			err = os.WriteFile(path, data, 0644)
			if err != nil {
				fmt.Fprintf(os.Stderr, "error: %s\n", err)
				return err
			}
		}
		break
	}
	return nil
}

// unzip extracts src into dest, guarding against zip-slip path traversal, and
// returns the extracted file names.
func unzip(src string, dest string) ([]string, error) {
	var filenames []string
	r, err := zip.OpenReader(src)
	if err != nil {
		return filenames, err
	}
	defer r.Close()
	for _, f := range r.File {
		fpath := filepath.Join(dest, f.Name)
		if !strings.HasPrefix(fpath, filepath.Clean(dest)+string(os.PathSeparator)) {
			return filenames, fmt.Errorf("%s: bad file path", fpath)
		}
		filenames = append(filenames, fpath)
		if f.FileInfo().IsDir() {
			os.MkdirAll(fpath, os.ModePerm)
			continue
		}
		if err = os.MkdirAll(filepath.Dir(fpath), os.ModePerm); err != nil {
			return filenames, err
		}
		outFile, err := os.OpenFile(fpath, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, f.Mode())
		if err != nil {
			return filenames, err
		}
		rc, err := f.Open()
		if err != nil {
			outFile.Close()
			return filenames, err
		}
		_, err = io.Copy(outFile, rc)
		outFile.Close()
		rc.Close()
		if err != nil {
			return filenames, err
		}
	}
	return filenames, nil
}

// downloader fetches each job and sends exactly one result per job, so the
// collector loop in main sees as many results as it queued downloads.
func downloader(id int, jobs <-chan string, results chan<- string) {
	for href := range jobs {
		if strings.HasSuffix(href, ".muse") {
			href = strings.TrimSuffix(href, ".muse") + ".zip"
		}
		result := ""
		dir := dirnameForUrl(href)
		dest := filepath.Join(*outputDir, dir)
		os.MkdirAll(dest, 0700)
		ext := filepath.Ext(href)[1:]
		if !useFormat[ext] {
			// Skip, but still report a result; returning here would kill
			// the worker and leave main waiting forever.
			results <- ""
			continue
		}
		name := filenameForUrl(href)
		if ext == "zip" {
			tmpDir, err := os.MkdirTemp(os.TempDir(), "taldl")
			if err != nil {
				fmt.Fprintf(os.Stderr, "error: %s\n", err)
				results <- ""
				continue
			}
			path := filepath.Join(tmpDir, name)
			err = download(href, path)
			if err != nil {
				fmt.Fprintf(os.Stderr, "error: %s\n", err)
				os.RemoveAll(tmpDir)
				results <- ""
				continue
			}
			_, err = unzip(path, *outputDir)
			if err != nil {
				fmt.Fprintf(os.Stderr, "error: %s\n", err)
				os.RemoveAll(tmpDir)
				results <- ""
				continue
			}
			os.RemoveAll(tmpDir)
		} else {
			path := filepath.Join(dest, name)
			err := download(href, path)
			if err != nil {
				fmt.Fprintf(os.Stderr, "error: %s\n", err)
				results <- ""
				continue
			}
		}
		if *verbose {
			results <- result
		} else {
			results <- ""
		}
	}
}

// scanner fetches listing pages and returns the entry links on each, stopping
// once the cached most-recent entry has been seen.
func scanner(id int, jobs <-chan int, results chan<- []string) {
	var result []string
	c := colly.NewCollector()
	c.OnHTML("div.amw-listing-item a[href]", func(e *colly.HTMLElement) {
		href := e.Attr("href")
		if cache.MostRecent == href {
			done.Store(true)
		}
		if !done.Load() {
			result = append(result, href)
		}
	})
	c.AllowURLRevisit = true
	for i := range jobs {
		url := fmt.Sprintf("https://theanarchistlibrary.org/latest/%d", i)
		for {
			result = []string{}
			err := c.Visit(url)
			if err != nil {
				fmt.Fprintf(os.Stderr, "error: %s\n", err)
				time.Sleep(time.Second)
			} else {
				break
			}
		}
		results <- result
	}
}

// checker decides for each URL whether it needs downloading: missing local
// files are queued immediately, existing ones are compared via a HEAD request.
func checker(id int, jobs <-chan string, results chan<- string) {
	var modified bool
	var err error
	for href := range jobs {
		dir := dirnameForUrl(href)
		name := filenameForUrl(href)
		path := filepath.Join(*outputDir, dir, name)
		if strings.HasSuffix(name, ".zip") {
			// Zip archives are unpacked, so the .muse file is what remains
			// on disk.
			if !fileExists(strings.TrimSuffix(path, ".zip") + ".muse") {
				results <- href
				continue
			}
		} else {
			if !fileExists(path) {
				results <- href
				continue
			}
		}
		for {
			modified, err = check(href, path, time.Second*30)
			if err != nil {
				fmt.Fprintf(os.Stderr, "error: %s\n", err)
				time.Sleep(time.Second)
				continue
			}
			break
		}
		if modified {
			results <- href
		} else {
			results <- ""
		}
	}
}

func save(cache Cache) {
	cacheData, _ := json.Marshal(&cache)
	_ = os.WriteFile(cacheFile, cacheData, 0644)
}

// downloadFormat is currently unused.
func downloadFormat(href string, ext string, dest string) error {
	return nil
}
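// A typical invocation might look like the following (assuming the binary is
// built as "taldl"; the output path and format list here are illustrative,
// the flag names are the ones defined above):
//
//	taldl -output ~/Texts/TAL -formats epub,pdf -workers 8
//
// Running with -update rescans every listing page instead of stopping at the
// entry recorded in ~/.taldl.json.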