// taldl — bulk-downloads entries from theanarchistlibrary.org in several formats.
package main
|
|
|
|
import (
|
|
"archive/zip"
|
|
"context"
|
|
"encoding/json"
|
|
"flag"
|
|
"fmt"
|
|
"io"
|
|
"io/ioutil"
|
|
"net/http"
|
|
"os"
|
|
"os/user"
|
|
"path"
|
|
"path/filepath"
|
|
"strconv"
|
|
"strings"
|
|
"sync"
|
|
"time"
|
|
|
|
"github.com/gocolly/colly"
|
|
"github.com/schollz/progressbar/v3"
|
|
)
|
|
|
|
// Command-line flags.
var outputDir = flag.String("output", "~/TAL", "output directory")
var formats = flag.String("formats", "zip,epub,pdf,a4.pdf,lt.pdf", "formats to download")
var progress = flag.Bool("progress", true, "show progress bar")
var verbose = flag.Bool("verbose", true, "verbose output")
var workers = flag.Int("workers", 5, "amount of workers")
var update = flag.Bool("update", false, "update all entries")
|
|
|
|
// Cache persists scan state between runs; it is serialized as JSON to
// cacheFile by save() and loaded back in main().
type Cache struct {
	MostRecent string // href of the newest listing entry seen on the previous run
	Pages      int    // last-page number of the listing seen on the previous run
}
|
|
|
|
// cacheFile is expanded to an absolute path by fixPath() in main().
var cacheFile = "~/.taldl.json"

// cache holds the state loaded from / saved to cacheFile.
var cache Cache

// mostRecent is the href of the first entry on listing page 1.
var mostRecent string

// done is set once the previously-seen MostRecent href is encountered.
// NOTE(review): it is read and written by multiple scanner goroutines
// without synchronization — confirm and guard if this matters.
var done bool

// useFormat maps each requested file extension (from -formats) to true.
var useFormat map[string]bool

// missing collects URLs that returned 404 during download.
// NOTE(review): appended from multiple worker goroutines without a lock.
var missing []string
|
|
|
|
// main drives a three-stage pipeline: scan listing pages for entry hrefs,
// HEAD-check which files are new or changed, then download them.
func main() {
	var bar *progressbar.ProgressBar
	var lastPage int
	var hrefs = []string{}
	var hrefsMutex sync.Mutex
	// NOTE(review): this local shadows the package-level `done` that
	// scanner() uses; the page-1 callbacks below set only this copy.
	var done bool

	flag.Parse()

	// Build the set of requested formats from the -formats flag.
	useFormat = make(map[string]bool)
	fmts := strings.Split(*formats, ",")
	for _, f := range fmts {
		useFormat[f] = true
	}

	// Load the previous run's state; a missing or corrupt cache is
	// deliberately ignored and simply results in a fuller scan.
	cacheFile = fixPath(cacheFile)
	cacheData, _ := ioutil.ReadFile(cacheFile)
	_ = json.Unmarshal(cacheData, &cache)

	*outputDir = fixPath(*outputDir)

	os.MkdirAll(*outputDir, os.ModePerm)

	// Visit listing page 1 to learn the total page count, the most
	// recent entry, and the hrefs on that page.
	c := colly.NewCollector()
	c.OnHTML("ul.pagination li:nth-last-child(2)", func(e *colly.HTMLElement) {
		// The second-to-last pagination item holds the last page number.
		lastPage, _ = strconv.Atoi(strings.TrimSpace(e.Text))
	})
	c.OnHTML("div.amw-listing-item:nth-child(1) a[href]", func(e *colly.HTMLElement) {
		href := e.Attr("href")
		mostRecent = href
	})
	c.OnHTML("div.amw-listing-item a[href]", func(e *colly.HTMLElement) {
		href := e.Attr("href")
		// Stop collecting once we reach the entry we already had,
		// unless -update forces a full refresh.
		if cache.MostRecent == href {
			done = true
		}
		if !done || *update {
			hrefs = append(hrefs, href)
		}
	})

	url := "https://theanarchistlibrary.org/latest/1"
	c.Visit(url)

	newPages := 1

	if *update {
		newPages = lastPage
	} else {
		// NOTE(review): if the visit above failed, lastPage stays 0 and
		// this can go negative; the channel makes below would then panic
		// with a negative buffer size — confirm.
		newPages = lastPage - cache.Pages
	}

	if newPages > 0 {
		if *verbose {
			fmt.Fprintf(os.Stderr, "Scanning latest entries... \r\n")
		}

		if *progress {
			bar = progressbar.Default(int64(newPages))
			// Page 1 was already scanned above.
			bar.Add(1)
		}
	}

	// Stage 1: fan the remaining listing pages (2..newPages) out to
	// scanner workers; each job yields one slice of hrefs.
	scanJobs := make(chan int, newPages)
	scanResults := make(chan []string, newPages)

	for w := 1; w <= *workers; w++ {
		go scanner(w, scanJobs, scanResults)
	}
	for a := 2; a <= newPages; a++ {
		scanJobs <- a
	}
	close(scanJobs)

	for a := 2; a <= newPages; a++ {
		result := <-scanResults
		hrefsMutex.Lock()
		hrefs = append(hrefs, result...)
		hrefsMutex.Unlock()
		if *progress {
			bar.Add(1)
		}
	}
	close(scanResults)

	cache.Pages = lastPage
	cache.MostRecent = mostRecent

	// One check job per (href, requested format) pair.
	numJobs := len(hrefs) * len(useFormat)

	if numJobs == 0 {
		save(cache)
		return
	}

	if *verbose {
		fmt.Fprintf(os.Stderr, "Checking %d files for updates...\r\n", numJobs)
	}

	if *progress {
		bar = progressbar.Default(int64(numJobs))
		bar.ChangeMax(numJobs)
	}

	// Stage 2: HEAD-check every requested format of every href.
	checkJobs := make(chan string, numJobs)
	checkResults := make(chan string, numJobs)
	downloadJobs := make(chan string, numJobs)
	downloadResults := make(chan string, numJobs)
	downloadCount := 0

	for w := 1; w <= *workers; w++ {
		go checker(w, checkJobs, checkResults)
	}

	for _, href := range hrefs {
		for ext, use := range useFormat {
			if use {
				checkJobs <- href + "." + ext
			}
		}
	}
	close(checkJobs)

	// Collect check results; a non-empty result is a URL that must be
	// (re)fetched. checker sends exactly one result per job.
	for a := 1; a <= numJobs; a++ {
		r := <-checkResults
		if *progress {
			bar.Add(1)
		}
		if r != "" {
			downloadJobs <- r
			downloadCount++
		}
	}
	close(checkResults)
	close(downloadJobs)

	if downloadCount == 0 {
		save(cache)
		return
	}

	if *verbose {
		fmt.Fprintf(os.Stderr, "Downloading %d entries...\r\n", downloadCount)
	}

	if *progress {
		bar = progressbar.Default(int64(downloadCount))
	}

	// Stage 3: download everything that changed.
	for w := 1; w <= *workers; w++ {
		go downloader(w, downloadJobs, downloadResults)
	}

	buffer := ""

	for a := 1; a <= downloadCount; a++ {
		r := <-downloadResults
		if *progress {
			bar.Add(1)
		}
		if r != "" {
			buffer += r
		}
	}
	close(downloadResults)

	save(cache)

	if buffer != "" {
		if *verbose {
			fmt.Fprintln(os.Stderr, buffer)
		}
	}

	// Report URLs that came back 404 during download.
	if len(missing) > 0 {
		fmt.Fprintln(os.Stderr, "Not found:")
		for _, url := range missing {
			fmt.Fprintln(os.Stderr, url)
		}
	}
}
|
|
|
|
func fixPath(path string) string {
|
|
path = filepath.FromSlash(path)
|
|
if (path)[0] == '~' {
|
|
user, err := user.Current()
|
|
if err != nil {
|
|
fmt.Fprintln(os.Stderr, err.Error())
|
|
os.Exit(10)
|
|
}
|
|
path = filepath.Join(user.HomeDir, (path)[1:])
|
|
}
|
|
path, _ = filepath.Abs(path)
|
|
return path
|
|
}
|
|
|
|
// filenameForUrl extracts the final slash-separated element of url —
// the file name the URL points at.
func filenameForUrl(url string) string {
	base := path.Base(url)
	return base
}
|
|
|
|
// dirnameForUrl derives the per-entry directory name from a download URL:
// the base name with its extension removed, and for PDFs also any paper-size
// sub-extension (".a4" or ".lt") stripped, so all formats of one entry map
// to the same directory.
func dirnameForUrl(url string) string {
	name := path.Base(url)
	ext := filepath.Ext(name)
	name = strings.TrimSuffix(name, ext)

	if ext == ".pdf" {
		for _, size := range []string{".a4", ".lt"} {
			if strings.HasSuffix(name, size) {
				name = strings.TrimSuffix(name, size)
				break
			}
		}
	}
	return name
}
|
|
|
|
func check(url string, path string, timeout time.Duration) (modified bool, err error) {
|
|
request, err := http.NewRequest("HEAD", url, nil)
|
|
if err != nil {
|
|
return
|
|
}
|
|
|
|
ctx, cancel := context.WithTimeout(context.Background(), timeout)
|
|
defer cancel()
|
|
|
|
request = request.WithContext(ctx)
|
|
|
|
response, err := http.DefaultClient.Do(request)
|
|
if err != nil {
|
|
return
|
|
}
|
|
defer response.Body.Close()
|
|
|
|
if response.StatusCode == 404 {
|
|
fmt.Fprintln(os.Stderr, url+" was removed?")
|
|
return false, nil
|
|
}
|
|
|
|
if response.StatusCode != 200 {
|
|
return false, fmt.Errorf("%s: %s", url, response.Status)
|
|
}
|
|
|
|
lmh := response.Header.Get("Last-Modified")
|
|
lm, err := http.ParseTime(lmh)
|
|
if err != nil {
|
|
return true, err
|
|
}
|
|
|
|
file, err := os.Stat(path)
|
|
if err != nil {
|
|
return true, nil
|
|
}
|
|
|
|
clh := response.Header.Get("Content-Length")
|
|
cl, err := strconv.ParseInt(clh, 10, 0)
|
|
if err != nil {
|
|
return true, nil
|
|
}
|
|
|
|
if file.Size() != cl {
|
|
return true, nil
|
|
}
|
|
|
|
if lm.After(file.ModTime()) {
|
|
return true, nil
|
|
}
|
|
|
|
return false, nil
|
|
}
|
|
|
|
func get(url string, timeout time.Duration) (content []byte, err error) {
|
|
request, err := http.NewRequest("GET", url, nil)
|
|
if err != nil {
|
|
return
|
|
}
|
|
|
|
ctx, cancel := context.WithTimeout(context.Background(), timeout)
|
|
defer cancel()
|
|
|
|
request = request.WithContext(ctx)
|
|
|
|
response, err := http.DefaultClient.Do(request)
|
|
if err != nil {
|
|
return
|
|
}
|
|
defer response.Body.Close()
|
|
|
|
if response.StatusCode == 404 {
|
|
// fmt.Fprintln(os.Stderr, url+" NOT FOUND!")
|
|
missing = append(missing, url)
|
|
return
|
|
}
|
|
|
|
if response.StatusCode != 200 {
|
|
return nil, fmt.Errorf("%s: %s", url, response.Status)
|
|
}
|
|
|
|
return ioutil.ReadAll(response.Body)
|
|
}
|
|
|
|
func fileExists(path string) bool {
|
|
info, err := os.Stat(path)
|
|
if os.IsNotExist(err) {
|
|
return false
|
|
}
|
|
return !info.IsDir()
|
|
}
|
|
|
|
func download(url string, path string) (err error) {
|
|
for {
|
|
data, err := get(url, time.Second*60)
|
|
if err != nil {
|
|
fmt.Fprintf(os.Stderr, "error: %s: %s\n", url, err)
|
|
time.Sleep(time.Second)
|
|
continue
|
|
}
|
|
|
|
if len(data) > 0 {
|
|
err = ioutil.WriteFile(path, data, 0644)
|
|
if err != nil {
|
|
fmt.Fprintf(os.Stderr, "error: %s\n", err)
|
|
return err
|
|
}
|
|
}
|
|
break
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func unzip(src string, dest string) ([]string, error) {
|
|
var filenames []string
|
|
|
|
r, err := zip.OpenReader(src)
|
|
if err != nil {
|
|
return filenames, err
|
|
}
|
|
defer r.Close()
|
|
|
|
for _, f := range r.File {
|
|
fpath := filepath.Join(dest, f.Name)
|
|
|
|
if !strings.HasPrefix(fpath, filepath.Clean(dest)+string(os.PathSeparator)) {
|
|
return filenames, fmt.Errorf("%s: bad file path", fpath)
|
|
}
|
|
|
|
filenames = append(filenames, fpath)
|
|
|
|
if f.FileInfo().IsDir() {
|
|
os.MkdirAll(fpath, os.ModePerm)
|
|
continue
|
|
}
|
|
|
|
if err = os.MkdirAll(filepath.Dir(fpath), os.ModePerm); err != nil {
|
|
return filenames, err
|
|
}
|
|
|
|
outFile, err := os.OpenFile(fpath, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, f.Mode())
|
|
if err != nil {
|
|
return filenames, err
|
|
}
|
|
|
|
rc, err := f.Open()
|
|
if err != nil {
|
|
return filenames, err
|
|
}
|
|
|
|
_, err = io.Copy(outFile, rc)
|
|
|
|
outFile.Close()
|
|
rc.Close()
|
|
|
|
if err != nil {
|
|
return filenames, err
|
|
}
|
|
}
|
|
return filenames, nil
|
|
}
|
|
|
|
func downloader(id int, jobs <-chan string, results chan<- string) {
|
|
for href := range jobs {
|
|
result := ""
|
|
|
|
dir := dirnameForUrl(href)
|
|
|
|
dest := filepath.Join(*outputDir, dir)
|
|
os.MkdirAll(dest, 0700)
|
|
|
|
ext := filepath.Ext(href)[1:]
|
|
|
|
if !useFormat[ext] {
|
|
return
|
|
}
|
|
|
|
name := filenameForUrl(href)
|
|
|
|
if ext == "zip" {
|
|
tmpDir, err := ioutil.TempDir(os.TempDir(), "taldl")
|
|
if err != nil {
|
|
fmt.Fprintf(os.Stderr, "error: %s\n", err)
|
|
continue
|
|
}
|
|
path := filepath.Join(tmpDir, name)
|
|
err = download(href, path)
|
|
if err != nil {
|
|
fmt.Fprintf(os.Stderr, "error: %s\n", err)
|
|
continue
|
|
}
|
|
_, err = unzip(path, *outputDir)
|
|
if err != nil {
|
|
fmt.Fprintf(os.Stderr, "error: %s\n", err)
|
|
continue
|
|
}
|
|
os.RemoveAll(tmpDir)
|
|
} else {
|
|
path := filepath.Join(dest, name)
|
|
err := download(href, path)
|
|
if err != nil {
|
|
fmt.Fprintf(os.Stderr, "error: %s\n", err)
|
|
continue
|
|
}
|
|
}
|
|
|
|
if *verbose {
|
|
results <- result
|
|
} else {
|
|
results <- ""
|
|
}
|
|
}
|
|
}
|
|
|
|
// scanner is a worker goroutine: for each page number received on jobs it
// fetches https://theanarchistlibrary.org/latest/<n> and sends the entry
// hrefs found on that page to results — exactly one slice per job.
func scanner(id int, jobs <-chan int, results chan<- []string) {
	// result is captured by the OnHTML callback below and reset before
	// each visit; safe only because this collector is used by one goroutine.
	var result []string

	c := colly.NewCollector()
	c.OnHTML("div.amw-listing-item a[href]", func(e *colly.HTMLElement) {
		href := e.Attr("href")
		// NOTE(review): package-level `done` is read/written here by every
		// scanner goroutine without synchronization — a data race; pages are
		// also processed out of order, so the cut-off is approximate. Confirm.
		if cache.MostRecent == href {
			done = true
		}
		if !done {
			result = append(result, href)
		}
	})
	// A page may be visited again after a failed attempt below.
	c.AllowURLRevisit = true

	for i := range jobs {
		url := fmt.Sprintf("https://theanarchistlibrary.org/latest/%d", i)
		for {
			// Retry the page until the visit succeeds.
			// NOTE(review): unbounded retry — a permanently failing page
			// blocks this worker forever; confirm intended.
			result = []string{}
			err := c.Visit(url)
			if err != nil {
				fmt.Fprintf(os.Stderr, "error: %s\n", err)
				time.Sleep(time.Second)
			} else {
				break
			}
		}
		results <- result
	}
}
|
|
|
|
func checker(id int, jobs <-chan string, results chan<- string) {
|
|
var modified bool
|
|
var err error
|
|
|
|
for href := range jobs {
|
|
dir := dirnameForUrl(href)
|
|
name := filenameForUrl(href)
|
|
path := filepath.Join(*outputDir, dir, name)
|
|
|
|
if !fileExists(path) {
|
|
results <- href
|
|
continue
|
|
}
|
|
|
|
for {
|
|
modified, err = check(href, path, time.Second*30)
|
|
if err != nil {
|
|
fmt.Fprintf(os.Stderr, "error: %s\n", err)
|
|
time.Sleep(time.Second)
|
|
continue
|
|
}
|
|
break
|
|
}
|
|
|
|
if modified {
|
|
results <- href
|
|
} else {
|
|
results <- ""
|
|
}
|
|
}
|
|
}
|
|
|
|
func save(cache Cache) {
|
|
cacheData, _ := json.Marshal(&cache)
|
|
_ = ioutil.WriteFile(cacheFile, cacheData, 0644)
|
|
}
|
|
|
|
// downloadFormat is an unused stub: it has no caller in this file and
// always returns nil. TODO(review): implement or remove.
func downloadFormat(href string, ext string, dest string) error {
	return nil
}
|