Skip to content

Commit 08d74d8

Browse files
committed
performance and edge case handling improvements
+ new, better and cuter logging
1 parent ed1f031 commit 08d74d8

File tree

14 files changed

+512
-412
lines changed

14 files changed

+512
-412
lines changed

cmd/goop.go

+6-5
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,11 @@
11
package cmd
22

33
import (
4-
"fmt"
4+
"os"
5+
56
"github.com/deletescape/goop/pkg/goop"
7+
"github.com/phuslu/log"
68
"github.com/spf13/cobra"
7-
"os"
89
)
910

1011
var force bool
@@ -21,12 +22,12 @@ var rootCmd = &cobra.Command{
2122
}
2223
if list {
2324
if err := goop.CloneList(args[0], dir, force, keep); err != nil {
24-
fmt.Fprintln(os.Stderr, err)
25+
log.Error().Err(err).Msg("exiting")
2526
os.Exit(1)
2627
}
2728
} else {
2829
if err := goop.Clone(args[0], dir, force, keep); err != nil {
29-
fmt.Fprintln(os.Stderr, err)
30+
log.Error().Err(err).Msg("exiting")
3031
os.Exit(1)
3132
}
3233
}
@@ -41,7 +42,7 @@ func init() {
4142

4243
func Execute() {
4344
if err := rootCmd.Execute(); err != nil {
44-
fmt.Fprintln(os.Stderr, err)
45+
log.Error().Err(err).Msg("exiting")
4546
os.Exit(1)
4647
}
4748
}

go.mod

+1
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ require (
66
github.com/PuerkitoBio/goquery v1.6.0
77
github.com/go-git/go-billy/v5 v5.0.0
88
github.com/go-git/go-git/v5 v5.2.0
9+
github.com/phuslu/log v1.0.75
910
github.com/spf13/cobra v1.1.1
1011
github.com/valyala/fasthttp v1.16.0
1112
)

go.sum

+2
Original file line numberDiff line numberDiff line change
@@ -164,6 +164,8 @@ github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e/go.mod h1:zD1mROLA
164164
github.com/oklog/ulid v1.3.1/go.mod h1:CirwcVhetQ6Lv90oh/F+FBtV6XMibvdAFo93nm5qn4U=
165165
github.com/pascaldekloe/goe v0.0.0-20180627143212-57f6aae5913c/go.mod h1:lzWF7FIEvWOWxwDKqyGYQf6ZUaNfKdP144TG7ZOy1lc=
166166
github.com/pelletier/go-toml v1.2.0/go.mod h1:5z9KED0ma1S8pY6P1sdut58dfprrGBbd/94hg7ilaic=
167+
github.com/phuslu/log v1.0.75 h1:2Qcqgwo1sOsvj7QIuclIS92hmWxIISI2+XskYM1Nw2A=
168+
github.com/phuslu/log v1.0.75/go.mod h1:kzJN3LRifrepxThMjufQwS7S35yFAB+jAV1qgA7eBW4=
167169
github.com/pkg/errors v0.8.0/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
168170
github.com/pkg/errors v0.8.1 h1:iURUrRGxPUNPdy5/HRSm+Yj6okJ6UtLINN0Q9M4+h3I=
169171
github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=

internal/jobtracker/jobtracker.go

+64
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
package jobtracker
2+
3+
import (
4+
"sync"
5+
"sync/atomic"
6+
"time"
7+
)
8+
9+
type JobTracker struct {
10+
activeWorkers int32
11+
queuedJobs int32
12+
didWork bool
13+
cond *sync.Cond
14+
Queue chan string
15+
}
16+
17+
func Nap() {
18+
time.Sleep(40 * time.Millisecond)
19+
}
20+
21+
func NewJobTracker() *JobTracker {
22+
return &JobTracker{
23+
cond: sync.NewCond(&sync.Mutex{}),
24+
Queue: make(chan string, 999999), // TODO: dont create oversized queues, we should try to save memory; maybe read the channel docs again
25+
}
26+
}
27+
28+
func (jt *JobTracker) AddJob(job string) {
29+
// TODO: can we discard empty jobs here?
30+
jt.cond.L.Lock()
31+
atomic.AddInt32(&jt.queuedJobs, 1)
32+
jt.Queue <- job
33+
jt.cond.L.Unlock()
34+
}
35+
36+
func (jt *JobTracker) StartWork() {
37+
atomic.AddInt32(&jt.activeWorkers, 1)
38+
}
39+
40+
func (jt *JobTracker) EndWork() {
41+
jt.didWork = true
42+
atomic.AddInt32(&jt.activeWorkers, -1)
43+
atomic.AddInt32(&jt.queuedJobs, -1)
44+
}
45+
46+
func (jt *JobTracker) HasWork() bool {
47+
// TODO: didWork is a somewhat ugly workaround to ensure we dont exit before doing work at least once,
48+
// this will however result in locking up if we create a JobTracker but never queue any jobs
49+
hasWork := !jt.didWork || (atomic.LoadInt32(&jt.queuedJobs) > 0 && atomic.LoadInt32(&jt.activeWorkers) > 0)
50+
51+
if !hasWork {
52+
jt.cond.Broadcast()
53+
}
54+
return hasWork
55+
}
56+
57+
func (jt *JobTracker) Wait() {
58+
defer close(jt.Queue)
59+
60+
jt.cond.L.Lock()
61+
for jt.HasWork() {
62+
jt.cond.Wait()
63+
}
64+
}

internal/utils/html.go

+3-2
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,10 @@ package utils
22

33
import (
44
"bytes"
5-
"github.com/PuerkitoBio/goquery"
65
"net/url"
76
"strings"
7+
8+
"github.com/PuerkitoBio/goquery"
89
)
910

1011
var htmlTag = []byte{'<', 'h', 't', 'm', 'l'}
@@ -36,5 +37,5 @@ func GetIndexedFiles(body []byte) ([]string, error) {
3637
}
3738
return true
3839
})
39-
return files, err
40+
return files, exitErr
4041
}

internal/utils/urls.go

+1
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ package utils
22

33
import "strings"
44

5+
//TODO: replace all uses of this with the proper path utils
56
func Url(base, path string) string {
67
return strings.TrimSuffix(base, "/") + "/" + strings.TrimPrefix(path, "/")
78
}

internal/workers/consts.go

-8
This file was deleted.

internal/workers/download.go

+58-50
Original file line numberDiff line numberDiff line change
@@ -1,65 +1,73 @@
11
package workers
22

33
import (
4-
"fmt"
5-
"github.com/deletescape/goop/internal/utils"
6-
"github.com/valyala/fasthttp"
74
"io/ioutil"
85
"os"
9-
"sync"
10-
"time"
6+
7+
"github.com/deletescape/goop/internal/jobtracker"
8+
"github.com/deletescape/goop/internal/utils"
9+
"github.com/phuslu/log"
10+
"github.com/valyala/fasthttp"
1111
)
1212

13-
func DownloadWorker(c *fasthttp.Client, queue chan string, baseUrl, baseDir string, wg *sync.WaitGroup, allowHtml bool) {
14-
defer wg.Done()
15-
var ctr int
13+
func DownloadWorker(c *fasthttp.Client, baseUrl, baseDir string, jt *jobtracker.JobTracker, allowHtml, allowEmpty bool) {
1614
for {
1715
select {
18-
case file := <-queue:
19-
20-
checkRatelimted()
21-
if file == "" {
22-
continue
23-
}
24-
targetFile := utils.Url(baseDir, file)
25-
if utils.Exists(targetFile) {
26-
fmt.Printf("%s was downloaded already, skipping\n", targetFile)
27-
continue
28-
}
29-
uri := utils.Url(baseUrl, file)
30-
code, body, err := c.Get(nil, uri)
31-
fmt.Printf("[-] Fetching %s [%d]\n", uri, code)
32-
if err != nil {
33-
fmt.Fprintf(os.Stderr, "error: %s\n", err)
34-
continue
35-
}
36-
if code == 200 {
37-
if !allowHtml && utils.IsHtml(body) {
38-
fmt.Printf("warning: %s appears to be an html file, skipping\n", uri)
39-
continue
40-
}
41-
if utils.IsEmptyBytes(body) {
42-
fmt.Printf("warning: %s appears to be an empty file, skipping\n", uri)
43-
continue
44-
}
45-
if err := utils.CreateParentFolders(targetFile); err != nil {
46-
fmt.Fprintf(os.Stderr, "error: %s\n", err)
47-
continue
48-
}
49-
if err := ioutil.WriteFile(targetFile, body, os.ModePerm); err != nil {
50-
fmt.Fprintf(os.Stderr, "error: %s\n", err)
51-
}
52-
} else if code == 429 {
53-
setRatelimited()
54-
queue <- file
55-
}
16+
case file := <-jt.Queue:
17+
downloadWork(c, baseUrl, baseDir, file, jt, allowHtml, allowEmpty)
5618
default:
57-
// TODO: get rid of dirty hack somehow
58-
if ctr >= graceTimes {
19+
if !jt.HasWork() {
5920
return
6021
}
61-
ctr++
62-
time.Sleep(gracePeriod)
22+
jobtracker.Nap()
6323
}
6424
}
6525
}
26+
27+
func downloadWork(c *fasthttp.Client, baseUrl, baseDir, file string, jt *jobtracker.JobTracker, allowHtml, allowEmpty bool) {
28+
jt.StartWork()
29+
defer jt.EndWork()
30+
31+
if file == "" {
32+
return
33+
}
34+
checkRatelimted()
35+
36+
targetFile := utils.Url(baseDir, file)
37+
if utils.Exists(targetFile) {
38+
log.Info().Str("file", targetFile).Msg("already fetched, skipping redownload")
39+
return
40+
}
41+
uri := utils.Url(baseUrl, file)
42+
code, body, err := c.Get(nil, uri)
43+
if err == nil && code != 200 {
44+
if code == 429 {
45+
setRatelimited()
46+
jt.AddJob(file)
47+
return
48+
}
49+
log.Warn().Str("uri", uri).Int("code", code).Msg("couldn't fetch file")
50+
return
51+
} else if err != nil {
52+
log.Error().Str("uri", uri).Int("code", code).Err(err).Msg("couldn't fetch file")
53+
return
54+
}
55+
56+
if !allowHtml && utils.IsHtml(body) {
57+
log.Warn().Str("uri", uri).Msg("file appears to be html, skipping")
58+
return
59+
}
60+
if !allowEmpty && utils.IsEmptyBytes(body) {
61+
log.Warn().Str("uri", uri).Msg("file appears to be empty, skipping")
62+
return
63+
}
64+
if err := utils.CreateParentFolders(targetFile); err != nil {
65+
log.Error().Str("uri", uri).Str("file", targetFile).Err(err).Msg("couldn't create parent directories")
66+
return
67+
}
68+
if err := ioutil.WriteFile(targetFile, body, os.ModePerm); err != nil {
69+
log.Error().Str("uri", uri).Str("file", targetFile).Err(err).Msg("clouldn't write file")
70+
return
71+
}
72+
log.Info().Str("uri", uri).Str("file", file).Msg("fetched file")
73+
}

0 commit comments

Comments
 (0)