Skip to content

Commit 4d2889c

Browse files
committed
feat: Working Code but Dirty Checker
Closes #1. Not really happy with this: the other branches take a better approach but are not working yet. Here I am creating X collectors, with one proxy per collector.
1 parent 2bdca4e commit 4d2889c

File tree

2 files changed

+47
-28
lines changed

2 files changed

+47
-28
lines changed

cfprxchecker/checker.go

Lines changed: 46 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,12 @@
11
package cfprxchecker
22

33
import (
4+
"context"
45
"github.com/gocolly/colly"
56
"github.com/gocolly/colly/extensions"
67
"log"
78
"net/http"
9+
"net/url"
810
"os"
911
"strings"
1012
"sync"
@@ -21,6 +23,16 @@ type Args struct {
2123
muGood sync.Mutex
2224
}
2325

26+
// ProxyURL returns a proxy function (for use in a Transport)
27+
// that always returns the same URL and records it in the request context under colly.ProxyURLKey.
28+
func ProxyURL(fixedURL *url.URL) func(*http.Request) (*url.URL, error) {
29+
return func(pr *http.Request) (*url.URL, error) {
30+
ctx := context.WithValue(pr.Context(), colly.ProxyURLKey, fixedURL.String())
31+
*pr = *pr.WithContext(ctx)
32+
return fixedURL, nil
33+
}
34+
}
35+
2436
func (a *Args) writeGoodProxy(proxy string) {
2537
if proxy != "" {
2638

@@ -39,9 +51,7 @@ func (a *Args) writeGoodProxy(proxy string) {
3951

4052
func (a *Args) writeBadProxy(proxy string) {
4153
if proxy != "" {
42-
4354
var b strings.Builder
44-
4555
a.muBad.Lock()
4656
defer a.muBad.Unlock()
4757

@@ -54,13 +64,7 @@ func (a *Args) writeBadProxy(proxy string) {
5464
}
5565
}
5666

57-
func CheckProxiesAgainstCloudFlare(args *Args) {
58-
// Rotate the proxies
59-
rp, err := RoundRobinProxySwitcher(args.ProxyList...)
60-
if err != nil {
61-
log.Fatal(err)
62-
}
63-
67+
func NewCollector(args *Args) *colly.Collector {
6468
// Instantiate default collector
6569
c := colly.NewCollector(
6670
colly.Async(true),
@@ -71,40 +75,55 @@ func CheckProxiesAgainstCloudFlare(args *Args) {
7175
c.CacheDir = ""
7276

7377
c.WithTransport(&http.Transport{
74-
Proxy: rp,
7578
DisableKeepAlives: true,
7679
MaxIdleConns: 100,
7780
MaxIdleConnsPerHost: 100,
7881
})
7982

80-
// Limit the maximum parallelism to 24
81-
err = c.Limit(&colly.LimitRule{DomainGlob: "*", Parallelism: 24})
82-
83-
if err != nil {
84-
log.Printf("error while doing c.Limit %s", err)
85-
}
83+
c.SetRequestTimeout(args.TimeoutProxy)
8684

8785
extensions.RandomUserAgent(c)
8886

89-
if args.BadProxiesOutputFile != nil {
90-
c.OnError(func(response *colly.Response, err error) {
91-
log.Printf("[DEBUG] Bad proxy found error status %d [%s] [%s]", response.StatusCode, response.Request.ProxyURL, err)
92-
args.writeBadProxy(response.Request.ProxyURL)
93-
})
87+
return c
88+
}
89+
90+
func CheckProxiesAgainstCloudFlare(args *Args) {
91+
var onErrorCallback = func(response *colly.Response, err error) {
92+
log.Printf("[DEBUG] Bad proxy found error status %d [%s] [%s]", response.StatusCode, response.Request.ProxyURL, err)
93+
args.writeBadProxy(response.Request.ProxyURL)
9494
}
9595

96-
c.OnResponse(func(response *colly.Response) {
96+
var onResponseCallback = func(response *colly.Response) {
9797
log.Printf("[DEBUG] Good proxy found [%s]", response.Request.ProxyURL)
9898
args.writeGoodProxy(response.Request.ProxyURL)
99-
})
99+
}
100+
101+
var collectors []*colly.Collector
100102

101103
for _, proxy := range args.ProxyList {
102104
log.Printf("[DEBUG] Doing %s", proxy)
103-
if err = c.Visit(args.WebsiteToCrawl); err != nil {
104-
log.Printf("Error happening doing Visit %s", err)
105+
newCollector := NewCollector(args)
106+
newCollector.OnResponse(onResponseCallback)
107+
newCollector.OnError(onErrorCallback)
108+
u, err := url.Parse(proxy)
109+
if err != nil {
110+
log.Printf("error while parseing proxy %s %s", proxy, err)
111+
newCollector = nil
112+
continue
113+
}
114+
newCollector.SetProxyFunc(ProxyURL(u))
115+
if err = newCollector.Visit(args.WebsiteToCrawl); err != nil {
116+
log.Printf("error happening doing Visit %s", err)
117+
newCollector = nil
118+
continue
119+
105120
}
121+
collectors = append(collectors, newCollector)
122+
log.Printf("[DEBUG] end Doing %v", len(collectors))
123+
}
124+
125+
for _, collec := range collectors {
126+
collec.Wait()
106127
}
107128

108-
// Wait until threads are finished
109-
c.Wait()
110129
}

main.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ func main() {
1717
ProxyList string `arg:"positional,required" help:"path to the proxyList"`
1818
GoodProxiesPath string `default:"good.txt" help:"path to the good proxies identified"`
1919
BadProxiesPath string `default:"bad.txt" help:"path to the bad proxies identified"`
20-
TimeoutProxy int64 `default:"5" help:"timeout proxy duration"`
20+
TimeoutProxy int64 `default:"5" help:"timeout proxy duration in seconds"`
2121
}
2222

2323
arg.MustParse(&inputArgs)

0 commit comments

Comments
 (0)