package main import ( "archive/zip" "context" "encoding/json" "flag" "fmt" "io" "io/ioutil" "net/http" "os" "os/user" "path" "path/filepath" "strconv" "strings" "sync" "time" "github.com/gocolly/colly" "github.com/ipfs/go-cid" ipfs "github.com/ipfs/go-ipfs-api" "github.com/ipfs/ipfs-cluster/api" cluster "github.com/ipfs/ipfs-cluster/api/rest/client" "github.com/multiformats/go-multiaddr" "github.com/schollz/progressbar/v3" ) type Cache struct { MostRecent string Pages int CID string } var cacheFile = flag.String("cache", "~/.taldl/cache.json", "cache file") var outputDir = flag.String("output", "~/TAL", "output directory") var tmpDir = flag.String("tmpdir", "~/.taldl", "tmp directory") var formats = flag.String("formats", "zip,epub,pdf,a4.pdf,lt.pdf", "formats to download.") var fullUpdate = flag.Bool("full", false, "check everything for modifications") var verbose = flag.Bool("verbose", false, "verbose") var workers = flag.Int("workers", 1, "amount of workers") var ipfsAPI = flag.String("ipfs-api", "localhost:5001", "ipfs api") var ipfsEnabled = flag.Bool("ipfs", false, "pin to ipfs") var ipfsClusterEnabled = flag.Bool("ipfs-cluster", false, "pin to ipfs-cluster") var ipfsClusterAPI = flag.String("ipfs-cluster-api", "/ip4/127.0.0.1/tcp/9094", "ipfs-cluster api") var hrefs = []string{} var hrefsMutex sync.Mutex var mostRecent string var done bool func main() { var wg sync.WaitGroup var cache Cache var last int flag.Parse() *cacheFile = fixpath(*cacheFile) cacheData, _ := ioutil.ReadFile(*cacheFile) _ = json.Unmarshal(cacheData, &cache) *outputDir = fixpath(*outputDir) *tmpDir = fixpath(*tmpDir) os.MkdirAll(*outputDir, os.ModePerm) os.MkdirAll(*tmpDir, os.ModePerm) c := colly.NewCollector() c.OnHTML("ul.pagination li:nth-last-child(2)", func(e *colly.HTMLElement) { if last == 0 { last, _ = strconv.Atoi(strings.TrimSpace(e.Text)) } wg.Done() }) c.OnHTML("div.amw-listing-item a[href]", func(e *colly.HTMLElement) { if done { return } href := e.Attr("href") hrefsMutex.Lock() if len(hrefs) == 0 { mostRecent = href } if cache.MostRecent == href && *fullUpdate == false { done = true } else { hrefs = append(hrefs, href) } hrefsMutex.Unlock() }) bar := progressbar.Default(1) wg.Add(1) c.Visit("https://theanarchistlibrary.org/latest") wg.Wait() newPages := 0 if *fullUpdate { newPages = last - 1 } else { newPages = last - cache.Pages - 1 } bar.ChangeMax(1 + newPages) bar.Add(1) for i := 0; i < newPages; i++ { if done { bar.Add(newPages - i) break } wg.Add(1) for { err := c.Visit(fmt.Sprintf("https://theanarchistlibrary.org/latest/%d", i+2)) if err != nil { fmt.Fprintf(os.Stderr, "error: %s", err) } else { break } } bar.Add(1) } wg.Wait() cache.Pages = last cache.MostRecent = mostRecent numJobs := len(hrefs) if numJobs == 0 { save(cache) return } bar.Reset() bar.ChangeMax(numJobs) checkJobs := make(chan string, numJobs) checkResults := make(chan string, numJobs) downloadJobs := make(chan string, numJobs) downloadResults := make(chan string, numJobs) downloadCount := 0 for w := 1; w <= *workers; w++ { go checker(w, checkJobs, checkResults) } for _, href := range hrefs { checkJobs <- href } close(checkJobs) for w := 1; w <= *workers; w++ { go downloader(w, downloadJobs, downloadResults) } for a := 1; a <= numJobs; a++ { r := <-checkResults bar.Add(1) if r != "" { downloadCount++ downloadJobs <- r } } close(checkResults) close(downloadJobs) bar.Finish() if downloadCount == 0 { save(cache) return } bar.Reset() bar.ChangeMax(downloadCount) buffer := "" for a := 1; a <= downloadCount; a++ { r := <-downloadResults bar.Add(1) if r != "" { buffer += r } } close(downloadResults) bar.Finish() save(cache) if buffer != "" { fmt.Fprintln(os.Stderr, buffer) } if *ipfsEnabled { sh := ipfs.NewShell(*ipfsAPI) newCidStr, err := sh.AddDir(*outputDir) if err != nil { fmt.Fprintf(os.Stderr, "error: %s", err) os.Exit(1) } if cache.CID != "" && cache.CID != newCidStr { sh.Unpin(cache.CID) } if *ipfsClusterEnabled { ma, err := multiaddr.NewMultiaddr(*ipfsClusterAPI) if err != nil { fmt.Fprintf(os.Stderr, "error: %s", err) os.Exit(1) } sh, err := cluster.NewDefaultClient(&cluster.Config{APIAddr: ma}) if err != nil { fmt.Fprintf(os.Stderr, "error: %s", err) os.Exit(1) } newCid, err := cid.Parse(newCidStr) if err != nil { fmt.Fprintf(os.Stderr, "error: %s", err) os.Exit(1) } sh.Pin(context.TODO(), newCid, api.PinOptions{}) if cache.CID != "" { oldCid, err := cid.Parse(cache.CID) if err != nil { fmt.Fprintf(os.Stderr, "error: %s", err) os.Exit(1) } sh.Unpin(context.TODO(), oldCid) } } cache.CID = newCidStr save(cache) fmt.Fprintln(os.Stderr, newCidStr) } } func fixpath(path string) string { path = filepath.FromSlash(path) if (path)[0] == '~' { user, err := user.Current() if err != nil { fmt.Fprintln(os.Stderr, err.Error()) os.Exit(10) } path = filepath.Join(user.HomeDir, (path)[1:]) } path, _ = filepath.Abs(path) return path } func filenameForUrl(url string) string { return path.Base(url) } func head(url string, path string, timeout time.Duration) (modified bool, err error) { request, err := http.NewRequest("HEAD", url, nil) if err != nil { return } ctx, cancel := context.WithTimeout(context.Background(), timeout) defer cancel() request = request.WithContext(ctx) response, err := http.DefaultClient.Do(request) if err != nil { return } defer response.Body.Close() if response.StatusCode == 404 { return false, nil } if response.StatusCode != 200 { return false, fmt.Errorf("%s: %s", url, response.Status) } lmh := response.Header.Get("Last-Modified") lm, err := http.ParseTime(lmh) if err != nil { return true, err } file, err := os.Stat(path) if err != nil { return true, nil } clh := response.Header.Get("Content-Length") cl, err := strconv.ParseInt(clh, 10, 0) if err != nil { return true, nil } if file.Size() != cl { return true, nil } if file.ModTime().Before(lm) { return true, nil } return false, nil } func get(url string, timeout time.Duration) (content []byte, err error) { request, err := http.NewRequest("GET", url, nil) if err != nil { return } ctx, cancel := context.WithTimeout(context.Background(), timeout) defer cancel() request = request.WithContext(ctx) response, err := http.DefaultClient.Do(request) if err != nil { return } defer response.Body.Close() if response.StatusCode != 200 { return nil, fmt.Errorf("%s: %s", url, response.Status) } return ioutil.ReadAll(response.Body) } func fileExists(path string) bool { info, err := os.Stat(path) if os.IsNotExist(err) { return false } return !info.IsDir() } func download(url string, path string) (err error) { for { data, err := get(url, time.Second*10) if err != nil { return err } err = ioutil.WriteFile(path, data, 0644) if err != nil { return err } break } return nil } func unzip(src string, dest string) ([]string, error) { var filenames []string r, err := zip.OpenReader(src) if err != nil { return filenames, err } defer r.Close() for _, f := range r.File { fpath := filepath.Join(dest, f.Name) if !strings.HasPrefix(fpath, filepath.Clean(dest)+string(os.PathSeparator)) { return filenames, fmt.Errorf("%s: bad file path", fpath) } filenames = append(filenames, fpath) if f.FileInfo().IsDir() { os.MkdirAll(fpath, os.ModePerm) continue } if err = os.MkdirAll(filepath.Dir(fpath), os.ModePerm); err != nil { return filenames, err } outFile, err := os.OpenFile(fpath, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, f.Mode()) if err != nil { return filenames, err } rc, err := f.Open() if err != nil { return filenames, err } _, err = io.Copy(outFile, rc) outFile.Close() rc.Close() if err != nil { return filenames, err } } return filenames, nil } func downloader(id int, jobs <-chan string, results chan<- string) { for href := range jobs { dir := filenameForUrl(href) dest := filepath.Join(*outputDir, dir) os.MkdirAll(dest, 0700) result := "" ext := "zip" url := href + "." + ext name := filenameForUrl(url) path := filepath.Join(*tmpDir, name) err := download(url, path) if err != nil { result += err.Error() + "\r\n" } _, err = unzip(path, *outputDir) if err != nil { result += err.Error() + "\r\n" } os.Remove(path) ext = "epub" if strings.Contains(*formats, ext) { url = href + "." + ext name = filenameForUrl(url) path = filepath.Join(dest, name) err = download(url, path) if err != nil { result += err.Error() + "\r\n" } } ext = "pdf" if strings.Contains(*formats, ext) { url = href + "." + ext name = filenameForUrl(url) path = filepath.Join(dest, name) err = download(url, path) if err != nil { result += err.Error() + "\r\n" } } ext = "a4.pdf" if strings.Contains(*formats, ext) { url = href + "." + ext name = filenameForUrl(url) path = filepath.Join(dest, name) err = download(url, path) if err != nil { result += err.Error() + "\r\n" } } ext = "lt.pdf" if strings.Contains(*formats, ext) { url = href + "." + ext name = filenameForUrl(url) path = filepath.Join(dest, name) err = download(url, path) if err != nil { result += err.Error() + "\r\n" } } if *verbose { results <- result } else { results <- "" } } } func checker(id int, jobs <-chan string, results chan<- string) { for href := range jobs { ext := "muse" url := href + "." + ext dir := filenameForUrl(href) name := filenameForUrl(url) path := filepath.Join(*outputDir, dir, name) modified, err := head(url, path, time.Second*10) if err != nil { fmt.Fprintf(os.Stderr, "error: %s", err) results <- "" continue } if modified { results <- href } else { results <- "" } } } func save(cache Cache) { cacheData, _ := json.Marshal(&cache) _ = ioutil.WriteFile(*cacheFile, cacheData, 0644) }