From c817edc7a1af486780ae52b43bdfaa5c968b47a4 Mon Sep 17 00:00:00 2001 From: ron Date: Sat, 26 Jun 2021 06:17:14 +0200 Subject: [PATCH] improved --- main.go | 408 ++++++++++++++++++++++++++++---------------------------- 1 file changed, 205 insertions(+), 203 deletions(-) diff --git a/main.go b/main.go index 5451fd2..a2176a6 100644 --- a/main.go +++ b/main.go @@ -19,122 +19,118 @@ import ( "time" "github.com/gocolly/colly" - "github.com/ipfs/go-cid" - ipfs "github.com/ipfs/go-ipfs-api" - "github.com/ipfs/ipfs-cluster/api" - cluster "github.com/ipfs/ipfs-cluster/api/rest/client" - "github.com/multiformats/go-multiaddr" "github.com/schollz/progressbar/v3" ) type Cache struct { MostRecent string Pages int - CID string } -var cacheFile = flag.String("cache", "~/.taldl/cache.json", "cache file") +var cacheFile = "~/.taldl.json" + var outputDir = flag.String("output", "~/TAL", "output directory") -var tmpDir = flag.String("tmpdir", "~/.taldl", "tmp directory") +var formats = flag.String("formats", "zip,epub,pdf,a4.pdf,lt.pdf", "formats to download") +var progress = flag.Bool("progress", true, "show progress bar") +var verbose = flag.Bool("verbose", true, "verbose output") +var workers = flag.Int("workers", 5, "amount of workers") +var update = flag.Bool("update", false, "update all entries") -var formats = flag.String("formats", "zip,epub,pdf,a4.pdf,lt.pdf", "formats to download.") -var fullUpdate = flag.Bool("full", false, "check everything for modifications") -var verbose = flag.Bool("verbose", false, "verbose") -var workers = flag.Int("workers", 1, "amount of workers") - -var ipfsAPI = flag.String("ipfs-api", "localhost:5001", "ipfs api") -var ipfsEnabled = flag.Bool("ipfs", false, "pin to ipfs") - -var ipfsClusterEnabled = flag.Bool("ipfs-cluster", false, "pin to ipfs-cluster") -var ipfsClusterAPI = flag.String("ipfs-cluster-api", "/ip4/127.0.0.1/tcp/9094", "ipfs-cluster api") - -var hrefs = []string{} -var hrefsMutex sync.Mutex var mostRecent string var done bool +var cache Cache +var useFormat map[string]bool func main() { - var wg sync.WaitGroup - var cache Cache - var last int + var bar *progressbar.ProgressBar + var lastPage int + var hrefs = []string{} + var hrefsMutex sync.Mutex + var done bool flag.Parse() - *cacheFile = fixpath(*cacheFile) - cacheData, _ := ioutil.ReadFile(*cacheFile) + if *verbose { + fmt.Printf("writing to %s\r\n", *outputDir) + } + + useFormat = make(map[string]bool) + fmts := strings.Split(*formats, ",") + for _, f := range fmts { + useFormat[f] = true + } + + cacheFile = fixPath(cacheFile) + cacheData, _ := ioutil.ReadFile(cacheFile) _ = json.Unmarshal(cacheData, &cache) - *outputDir = fixpath(*outputDir) - *tmpDir = fixpath(*tmpDir) + *outputDir = fixPath(*outputDir) os.MkdirAll(*outputDir, os.ModePerm) - os.MkdirAll(*tmpDir, os.ModePerm) c := colly.NewCollector() c.OnHTML("ul.pagination li:nth-last-child(2)", func(e *colly.HTMLElement) { - if last == 0 { - last, _ = strconv.Atoi(strings.TrimSpace(e.Text)) - } - wg.Done() + lastPage, _ = strconv.Atoi(strings.TrimSpace(e.Text)) + }) + c.OnHTML("div.amw-listing-item:nth-child(1) a[href]", func(e *colly.HTMLElement) { + href := e.Attr("href") + mostRecent = href }) c.OnHTML("div.amw-listing-item a[href]", func(e *colly.HTMLElement) { - if done { - return - } - href := e.Attr("href") - - hrefsMutex.Lock() - - if len(hrefs) == 0 { - mostRecent = href - } - - if cache.MostRecent == href && *fullUpdate == false { + if cache.MostRecent == href { done = true - } else { + } + if !done || *update { hrefs = append(hrefs, href) } - - hrefsMutex.Unlock() }) - bar := progressbar.Default(1) - wg.Add(1) - c.Visit("https://theanarchistlibrary.org/latest") - wg.Wait() + url := "https://theanarchistlibrary.org/latest/1" + c.Visit(url) - newPages := 0 + newPages := 1 - if *fullUpdate { - newPages = last - 1 + if *update { + newPages = lastPage } else { - newPages = last - cache.Pages - 1 + newPages = lastPage - cache.Pages } - bar.ChangeMax(1 + newPages) - bar.Add(1) - - for i := 0; i < newPages; i++ { - if done { - bar.Add(newPages - i) - break + if newPages > 0 { + if *verbose { + fmt.Fprintf(os.Stderr, "Checking latest entries... \r\n") } - wg.Add(1) - for { - err := c.Visit(fmt.Sprintf("https://theanarchistlibrary.org/latest/%d", i+2)) - if err != nil { - fmt.Fprintf(os.Stderr, "error: %s", err) - } else { - break - } + if *progress { + bar = progressbar.Default(int64(newPages)) + bar.Add(1) } - bar.Add(1) } - wg.Wait() - cache.Pages = last + scanJobs := make(chan int, newPages) + scanResults := make(chan []string, newPages) + + for w := 1; w <= *workers; w++ { + go scanner(w, scanJobs, scanResults) + } + for a := 2; a <= newPages; a++ { + scanJobs <- a + } + close(scanJobs) + + for a := 2; a <= newPages; a++ { + result := <-scanResults + hrefsMutex.Lock() + hrefs = append(hrefs, result...) + hrefsMutex.Unlock() + if *progress { + bar.Add(1) + } + } + close(scanResults) + + cache.Pages = lastPage cache.MostRecent = mostRecent numJobs := len(hrefs) @@ -144,8 +140,14 @@ func main() { return } - bar.Reset() - bar.ChangeMax(numJobs) + if *verbose { + fmt.Fprintf(os.Stderr, "Checking %d entries for updates...\r\n", numJobs) + } + + if *progress { + bar = progressbar.Default(int64(numJobs)) + bar.ChangeMax(numJobs) + } checkJobs := make(chan string, numJobs) checkResults := make(chan string, numJobs) @@ -161,94 +163,59 @@ func main() { } close(checkJobs) - for w := 1; w <= *workers; w++ { - go downloader(w, downloadJobs, downloadResults) - } for a := 1; a <= numJobs; a++ { r := <-checkResults - bar.Add(1) + if *progress { + bar.Add(1) + } if r != "" { - downloadCount++ downloadJobs <- r + downloadCount++ } } close(checkResults) close(downloadJobs) - bar.Finish() - if downloadCount == 0 { save(cache) return } - bar.Reset() - bar.ChangeMax(downloadCount) + if *verbose { + fmt.Fprintf(os.Stderr, "Downloading %d entries...\r\n", downloadCount) + } + + if *progress { + bar = progressbar.Default(int64(downloadCount)) + } + + for w := 1; w <= *workers; w++ { + go downloader(w, downloadJobs, downloadResults) + } buffer := "" for a := 1; a <= downloadCount; a++ { r := <-downloadResults - bar.Add(1) + if *progress { + bar.Add(1) + } if r != "" { buffer += r } } close(downloadResults) - bar.Finish() + save(cache) if buffer != "" { - fmt.Fprintln(os.Stderr, buffer) - } - - if *ipfsEnabled { - sh := ipfs.NewShell(*ipfsAPI) - newCidStr, err := sh.AddDir(*outputDir) - if err != nil { - fmt.Fprintf(os.Stderr, "error: %s", err) - os.Exit(1) + if *verbose { + fmt.Fprintln(os.Stderr, buffer) } - if cache.CID != "" && cache.CID != newCidStr { - sh.Unpin(cache.CID) - } - if *ipfsClusterEnabled { - ma, err := multiaddr.NewMultiaddr(*ipfsClusterAPI) - if err != nil { - fmt.Fprintf(os.Stderr, "error: %s", err) - os.Exit(1) - } - - sh, err := cluster.NewDefaultClient(&cluster.Config{APIAddr: ma}) - if err != nil { - fmt.Fprintf(os.Stderr, "error: %s", err) - os.Exit(1) - } - - newCid, err := cid.Parse(newCidStr) - if err != nil { - fmt.Fprintf(os.Stderr, "error: %s", err) - os.Exit(1) - } - sh.Pin(context.TODO(), newCid, api.PinOptions{}) - - if cache.CID != "" { - oldCid, err := cid.Parse(cache.CID) - if err != nil { - fmt.Fprintf(os.Stderr, "error: %s", err) - os.Exit(1) - } - sh.Unpin(context.TODO(), oldCid) - } - } - cache.CID = newCidStr - save(cache) - - fmt.Fprintln(os.Stderr, newCidStr) } } -func fixpath(path string) string { +func fixPath(path string) string { path = filepath.FromSlash(path) if (path)[0] == '~' { user, err := user.Current() @@ -266,7 +233,7 @@ func filenameForUrl(url string) string { return path.Base(url) } -func head(url string, path string, timeout time.Duration) (modified bool, err error) { +func check(url string, path string, timeout time.Duration) (modified bool, err error) { request, err := http.NewRequest("HEAD", url, nil) if err != nil { return @@ -312,7 +279,7 @@ func head(url string, path string, timeout time.Duration) (modified bool, err er return true, nil } - if file.ModTime().Before(lm) { + if lm.After(file.ModTime()) { return true, nil } @@ -336,6 +303,10 @@ func get(url string, timeout time.Duration) (content []byte, err error) { } defer response.Body.Close() + if response.StatusCode == 404 { + return + } + if response.StatusCode != 200 { return nil, fmt.Errorf("%s: %s", url, response.Status) } @@ -353,13 +324,16 @@ func fileExists(path string) bool { func download(url string, path string) (err error) { for { - data, err := get(url, time.Second*10) + data, err := get(url, time.Second*60) if err != nil { - return err + fmt.Fprintf(os.Stderr, "error: %s: %s\n", url, err) + time.Sleep(time.Second) + continue } err = ioutil.WriteFile(path, data, 0644) if err != nil { + fmt.Fprintf(os.Stderr, "error: %s\n", err) return err } break @@ -418,72 +392,18 @@ func unzip(src string, dest string) ([]string, error) { } func downloader(id int, jobs <-chan string, results chan<- string) { + for href := range jobs { + result := "" dir := filenameForUrl(href) dest := filepath.Join(*outputDir, dir) - os.MkdirAll(dest, 0700) - result := "" - - ext := "zip" - url := href + "." + ext - name := filenameForUrl(url) - path := filepath.Join(*tmpDir, name) - err := download(url, path) - if err != nil { - result += err.Error() + "\r\n" - } - - _, err = unzip(path, *outputDir) - if err != nil { - result += err.Error() + "\r\n" - } - os.Remove(path) - - ext = "epub" - if strings.Contains(*formats, ext) { - url = href + "." + ext - name = filenameForUrl(url) - path = filepath.Join(dest, name) - err = download(url, path) - if err != nil { - result += err.Error() + "\r\n" - } - } - - ext = "pdf" - if strings.Contains(*formats, ext) { - url = href + "." + ext - name = filenameForUrl(url) - path = filepath.Join(dest, name) - err = download(url, path) - if err != nil { - result += err.Error() + "\r\n" - } - } - - ext = "a4.pdf" - if strings.Contains(*formats, ext) { - url = href + "." + ext - name = filenameForUrl(url) - path = filepath.Join(dest, name) - err = download(url, path) - if err != nil { - result += err.Error() + "\r\n" - } - } - - ext = "lt.pdf" - if strings.Contains(*formats, ext) { - url = href + "." + ext - name = filenameForUrl(url) - path = filepath.Join(dest, name) - err = download(url, path) - if err != nil { - result += err.Error() + "\r\n" - } - } + downloadFormat(href, "zip", dest) + downloadFormat(href, "epub", dest) + downloadFormat(href, "pdf", dest) + downloadFormat(href, "a4.pdf", dest) + downloadFormat(href, "lt.pdf", dest) if *verbose { results <- result @@ -493,19 +413,63 @@ func downloader(id int, jobs <-chan string, results chan<- string) { } } +func scanner(id int, jobs <-chan int, results chan<- []string) { + var result []string + + c := colly.NewCollector() + c.OnHTML("div.amw-listing-item a[href]", func(e *colly.HTMLElement) { + href := e.Attr("href") + if cache.MostRecent == href { + done = true + } + if !done { + result = append(result, href) + } + }) + c.AllowURLRevisit = true + + for i := range jobs { + url := fmt.Sprintf("https://theanarchistlibrary.org/latest/%d", i) + for { + result = []string{} + err := c.Visit(url) + if err != nil { + fmt.Fprintf(os.Stderr, "error: %s\n", err) + time.Sleep(time.Second) + } else { + break + } + } + results <- result + } +} + func checker(id int, jobs <-chan string, results chan<- string) { + var modified bool + var err error + for href := range jobs { ext := "muse" url := href + "." + ext dir := filenameForUrl(href) name := filenameForUrl(url) path := filepath.Join(*outputDir, dir, name) - modified, err := head(url, path, time.Second*10) - if err != nil { - fmt.Fprintf(os.Stderr, "error: %s", err) - results <- "" + + if !fileExists(path) { + results <- href continue } + + for { + modified, err = check(url, path, time.Second*30) + if err != nil { + fmt.Fprintf(os.Stderr, "error: %s\n", err) + time.Sleep(time.Second) + continue + } + break + } + if modified { results <- href } else { @@ -516,5 +480,43 @@ func checker(id int, jobs <-chan string, results chan<- string) { func save(cache Cache) { cacheData, _ := json.Marshal(&cache) - _ = ioutil.WriteFile(*cacheFile, cacheData, 0644) + _ = ioutil.WriteFile(cacheFile, cacheData, 0644) +} + +func downloadFormat(href string, ext string, dest string) error { + if !useFormat[ext] { + return nil + } + + url := href + "." + ext + name := filenameForUrl(url) + + if ext == "zip" { + tmpDir, err := ioutil.TempDir(os.TempDir(), "taldl") + if err != nil { + fmt.Fprintf(os.Stderr, "error: %s\n", err) + return err + } + path := filepath.Join(tmpDir, name) + err = download(url, path) + if err != nil { + fmt.Fprintf(os.Stderr, "error: %s\n", err) + return err + } + _, err = unzip(path, *outputDir) + if err != nil { + fmt.Fprintf(os.Stderr, "error: %s\n", err) + return err + } + os.RemoveAll(tmpDir) + } else { + path := filepath.Join(dest, name) + err := download(url, path) + if err != nil { + fmt.Fprintf(os.Stderr, "error: %s\n", err) + return err + } + } + + return nil }