taldl/main.go
2021-06-25 07:26:45 +02:00

459 lines
8.6 KiB
Go

package main
import (
"archive/zip"
"context"
"encoding/json"
"flag"
"fmt"
"io"
"io/ioutil"
"net/http"
"os"
"os/user"
"path"
"path/filepath"
"strconv"
"strings"
"sync"
"time"
"github.com/gocolly/colly"
"github.com/schollz/progressbar/v3"
)
// Store is the crawl state persisted between runs, serialized as JSON to
// the file named by the -store flag.
type Store struct {
	// MostRecent is the href of the newest document seen on the previous
	// run; the next incremental crawl stops when it is encountered again.
	MostRecent string
	// Pages is the number of listing pages that existed on the previous run.
	Pages int
}
// Command-line flags. Paths may start with "~", which fixpath expands to the
// current user's home directory.
var formats = flag.String("formats", "zip,epub,pdf,a4.pdf,lt.pdf", "formats to download.")
var fullUpdate = flag.Bool("full", false, "check everything for modifications")
var verbose = flag.Bool("verbose", false, "verbose")
var workers = flag.Int("workers", 1, "amount of workers")
var storeFile = flag.String("store", "~/.taldl/cache.json", "store file")
var outputDir = flag.String("output", "~/TAL", "output directory")
var zipDir = flag.String("zipdir", "~/.taldl", "zip directory")

// Shared crawl state mutated from colly callbacks; hrefsMutex guards all three.
var hrefs = []string{}
var hrefsMutex sync.Mutex
var mostRecent string // first (newest) href seen on this run
var done bool         // set once the previously stored MostRecent href is reached
// main crawls the site's "latest" listing to collect document hrefs that are
// new since the previous run (or all of them with -full), then checks and
// downloads them with a pool of worker goroutines.
func main() {
	var wg sync.WaitGroup
	var store Store
	var last int // total number of listing pages, scraped from the pagination bar
	flag.Parse()
	// Load previous state; a missing or unreadable store file just leaves the
	// zero value, which triggers a full crawl. Errors deliberately ignored.
	*storeFile = fixpath(*storeFile)
	storeData, _ := ioutil.ReadFile(*storeFile)
	_ = json.Unmarshal(storeData, &store)
	*outputDir = fixpath(*outputDir)
	*zipDir = fixpath(*zipDir)
	os.MkdirAll(*outputDir, os.ModePerm)
	os.MkdirAll(*zipDir, os.ModePerm)
	c := colly.NewCollector()
	// Pagination handler: fires once per visited listing page; the first page
	// fixes the page count. Each wg.Done here pairs with one wg.Add per Visit.
	c.OnHTML("ul.pagination li:nth-last-child(2)", func(e *colly.HTMLElement) {
		if last == 0 {
			last, _ = strconv.Atoi(strings.TrimSpace(e.Text))
		}
		wg.Done()
	})
	// Listing-item handler: collect hrefs until the href remembered from the
	// previous run shows up (incremental mode only).
	c.OnHTML("div.amw-listing-item a[href]", func(e *colly.HTMLElement) {
		if done {
			return
		}
		href := e.Attr("href")
		hrefsMutex.Lock()
		if len(hrefs) == 0 {
			// Very first href encountered is the newest document overall.
			mostRecent = href
		}
		if store.MostRecent == href && *fullUpdate == false {
			done = true
		} else {
			hrefs = append(hrefs, href)
		}
		hrefsMutex.Unlock()
	})
	bar := progressbar.Default(1)
	wg.Add(1)
	c.Visit("https://theanarchistlibrary.org/latest")
	wg.Wait()
	// Number of additional listing pages to fetch (page 1 was just visited).
	newPages := 0
	if *fullUpdate {
		newPages = last - 1
	} else {
		newPages = last - store.Pages - 1
	}
	bar.ChangeMax(1 + newPages)
	bar.Add(1)
	for i := 0; i < newPages; i++ {
		if done {
			// Already reached the previously seen document; skip the rest.
			bar.Add(newPages - i)
			break
		}
		wg.Add(1)
		// Retry the page until the visit succeeds.
		for {
			err := c.Visit(fmt.Sprintf("https://theanarchistlibrary.org/latest/%d", i+2))
			if err != nil {
				fmt.Println(err)
			} else {
				break
			}
		}
		bar.Add(1)
	}
	wg.Wait()
	store.Pages = last
	store.MostRecent = mostRecent
	numJobs := len(hrefs)
	if numJobs == 0 {
		save(store)
		return
	}
	// Phase 1: HEAD every collected href to find out what actually changed.
	// Channels are buffered to numJobs so producers never block.
	bar.Reset()
	bar.ChangeMax(numJobs)
	checkJobs := make(chan string, numJobs)
	checkResults := make(chan string, numJobs)
	downloadJobs := make(chan string, numJobs)
	downloadResults := make(chan string, numJobs)
	downloadCount := 0
	for w := 1; w <= *workers; w++ {
		go checker(w, checkJobs, checkResults)
	}
	for _, href := range hrefs {
		checkJobs <- href
	}
	close(checkJobs)
	for w := 1; w <= *workers; w++ {
		go downloader(w, downloadJobs, downloadResults)
	}
	// A non-empty check result means "modified"; feed it to the downloaders.
	for a := 1; a <= numJobs; a++ {
		r := <-checkResults
		bar.Add(1)
		if r != "" {
			downloadCount++
			downloadJobs <- r
		}
	}
	close(checkResults)
	close(downloadJobs)
	bar.Finish()
	if downloadCount == 0 {
		save(store)
		return
	}
	// Phase 2: collect one result per queued download, printing any errors.
	bar.Reset()
	bar.ChangeMax(downloadCount)
	for a := 1; a <= downloadCount; a++ {
		r := <-downloadResults
		bar.Add(1)
		if r != "" {
			fmt.Println(r)
		}
	}
	close(downloadResults)
	bar.Finish()
	save(store)
}
// fixpath normalizes a user-supplied path: it converts slashes for the host
// OS, expands a leading "~" to the current user's home directory, and makes
// the result absolute. A failure to resolve the current user is fatal
// (exit code 10).
func fixpath(path string) string {
	path = filepath.FromSlash(path)
	// Length guard: the original indexed path[0] unconditionally and
	// panicked on an empty string.
	if len(path) > 0 && path[0] == '~' {
		usr, err := user.Current()
		if err != nil {
			fmt.Fprintln(os.Stderr, err.Error())
			os.Exit(10)
		}
		path = filepath.Join(usr.HomeDir, path[1:])
	}
	// Abs only fails when the working directory is unavailable; fall back to
	// whatever we have in that case (original behavior).
	path, _ = filepath.Abs(path)
	return path
}
// filenameForUrl extracts the final element of a slash-separated URL path,
// e.g. ".../library/foo.zip" -> "foo.zip".
func filenameForUrl(rawURL string) string {
	return path.Base(rawURL)
}
// head issues a HEAD request for url and reports whether the local file at
// path is stale relative to the remote resource. A remote 404 is treated as
// "nothing to fetch" (false, nil); any other non-200 status is an error.
// The file is considered modified when it does not exist locally, when the
// sizes differ, or when the local mtime predates the remote Last-Modified.
func head(url string, path string, timeout time.Duration) (modified bool, err error) {
	request, err := http.NewRequest("HEAD", url, nil)
	if err != nil {
		return
	}
	ctx, cancel := context.WithTimeout(context.Background(), timeout)
	// The original only called cancel on two branches, leaking the context
	// timer on every other return path (go vet: lostcancel).
	defer cancel()
	response, err := http.DefaultClient.Do(request.WithContext(ctx))
	if err != nil {
		return
	}
	defer response.Body.Close()
	if response.StatusCode == http.StatusNotFound {
		return false, nil
	}
	if response.StatusCode != http.StatusOK {
		return false, fmt.Errorf("%s: %s", url, response.Status)
	}
	lm, err := http.ParseTime(response.Header.Get("Last-Modified"))
	if err != nil {
		// No usable Last-Modified header: report modified together with the
		// parse error (the caller skips the document on any error).
		return true, err
	}
	file, err := os.Stat(path)
	if err != nil {
		// No local copy yet.
		return true, nil
	}
	cl, err := strconv.ParseInt(response.Header.Get("Content-Length"), 10, 0)
	if err != nil {
		return true, nil
	}
	if file.Size() != cl {
		return true, nil
	}
	if file.ModTime().Before(lm) {
		return true, nil
	}
	return false, nil
}
// get fetches url with the given timeout and returns the response body.
// Any status other than 200 is reported as an error.
func get(url string, timeout time.Duration) (content []byte, err error) {
	request, err := http.NewRequest("GET", url, nil)
	if err != nil {
		return
	}
	ctx, cancel := context.WithTimeout(context.Background(), timeout)
	// The original only called cancel on the non-200 branch, leaking the
	// context timer on success and on transport errors (go vet: lostcancel).
	defer cancel()
	response, err := http.DefaultClient.Do(request.WithContext(ctx))
	if err != nil {
		return
	}
	defer response.Body.Close()
	if response.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("%s: %s", url, response.Status)
	}
	return ioutil.ReadAll(response.Body)
}
// fileExists reports whether path refers to an existing non-directory file.
func fileExists(path string) bool {
	info, err := os.Stat(path)
	if err != nil {
		// Covers not-exist as well as permission and other stat failures.
		// The original only checked os.IsNotExist and then dereferenced a
		// nil info on any other error, panicking.
		return false
	}
	return !info.IsDir()
}
// download fetches url with a 10-second timeout and writes the body to path
// with mode 0644. It returns the first fetch or write error.
func download(url string, path string) error {
	// The original wrapped this in a `for { ... break }` loop that could
	// never iterate more than once; the dead loop is removed.
	data, err := get(url, time.Second*10)
	if err != nil {
		return err
	}
	return ioutil.WriteFile(path, data, 0644)
}
// unzip extracts the archive at src into dest and returns the paths of all
// entries written so far (also on error, for diagnostics). Entries whose
// resolved path would escape dest abort the extraction.
func unzip(src string, dest string) ([]string, error) {
	var filenames []string
	r, err := zip.OpenReader(src)
	if err != nil {
		return filenames, err
	}
	defer r.Close()
	for _, f := range r.File {
		fpath := filepath.Join(dest, f.Name)
		// Zip-slip guard: the joined path must stay strictly inside dest.
		if !strings.HasPrefix(fpath, filepath.Clean(dest)+string(os.PathSeparator)) {
			return filenames, fmt.Errorf("%s: bad file path", fpath)
		}
		filenames = append(filenames, fpath)
		if f.FileInfo().IsDir() {
			os.MkdirAll(fpath, os.ModePerm)
			continue
		}
		// Ensure parent directories exist even when the archive has no
		// explicit directory entries.
		if err = os.MkdirAll(filepath.Dir(fpath), os.ModePerm); err != nil {
			return filenames, err
		}
		outFile, err := os.OpenFile(fpath, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, f.Mode())
		if err != nil {
			return filenames, err
		}
		rc, err := f.Open()
		if err != nil {
			return filenames, err
		}
		_, err = io.Copy(outFile, rc)
		// Close both handles before inspecting the copy error so nothing
		// leaks inside the loop (defer would pile up until function return).
		outFile.Close()
		rc.Close()
		if err != nil {
			return filenames, err
		}
	}
	return filenames, nil
}
func downloader(id int, jobs <-chan string, results chan<- string) {
for href := range jobs {
dir := filenameForUrl(href)
dest := filepath.Join(*outputDir, dir)
os.MkdirAll(dest, 0700)
ext := "zip"
url := href + "." + ext
name := filenameForUrl(url)
path := filepath.Join(*zipDir, name)
err := download(url, path)
if err != nil {
results <- fmt.Sprintf("%s: %s", url, err.Error())
continue
}
_, err = unzip(path, *outputDir)
if err != nil {
results <- fmt.Sprintf("%s: %s", url, err.Error())
continue
}
ext = "epub"
if strings.Contains(*formats, ext) {
url = href + "." + ext
name = filenameForUrl(url)
path = filepath.Join(dest, name)
err = download(url, path)
if err != nil {
results <- fmt.Sprintf("%s: %s", url, err.Error())
continue
}
}
ext = "lt.pdf"
if strings.Contains(*formats, ext) {
url = href + "." + ext
name = filenameForUrl(url)
path = filepath.Join(dest, name)
err = download(url, path)
if err != nil {
results <- fmt.Sprintf("%s: %s", url, err.Error())
continue
}
}
ext = "a4.pdf"
if strings.Contains(*formats, ext) {
url = href + "." + ext
name = filenameForUrl(url)
path = filepath.Join(dest, name)
err = download(url, path)
if err != nil {
results <- fmt.Sprintf("%s: %s", url, err.Error())
continue
}
}
ext = "pdf"
if strings.Contains(*formats, ext) {
url = href + "." + ext
name = filenameForUrl(url)
path = filepath.Join(dest, name)
err = download(url, path)
if err != nil {
results <- fmt.Sprintf("%s: %s", url, err.Error())
continue
}
}
if *verbose {
results <- fmt.Sprintf("%s: ok", href)
} else {
results <- ""
}
}
}
func checker(id int, jobs <-chan string, results chan<- string) {
for href := range jobs {
ext := "zip"
url := href + "." + ext
name := filenameForUrl(url)
path := filepath.Join(*zipDir, name)
modified, err := head(url, path, time.Second*10)
if err != nil {
fmt.Println(err)
results <- ""
continue
}
if modified {
results <- href
} else {
results <- ""
}
}
}
// save persists the crawl state to *storeFile as JSON. Failures are reported
// to stderr instead of being silently discarded (the original ignored both
// the marshal and the write error), but remain non-fatal.
func save(store Store) {
	storeData, err := json.Marshal(&store)
	if err != nil {
		fmt.Fprintln(os.Stderr, err.Error())
		return
	}
	if err := ioutil.WriteFile(*storeFile, storeData, 0644); err != nil {
		fmt.Fprintln(os.Stderr, err.Error())
	}
}