Skip to content

Commit 21677e3

Browse files
committed
refactor scraper: enhance AnimefireClient with retry logic and challenge page detection; add unit tests for search functionality
1 parent 91f9bd7 commit 21677e3

File tree

2 files changed

+227
-52
lines changed

2 files changed

+227
-52
lines changed

internal/scraper/animefire.go

Lines changed: 135 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
package scraper
33

44
import (
5+
"errors"
56
"fmt"
67
"net/http"
78
"net/url"
@@ -18,9 +19,11 @@ const (
1819

1920
// AnimefireClient handles interactions with Animefire.plus.
type AnimefireClient struct {
	// client performs the HTTP requests.
	client *http.Client
	// baseURL is the site root used to build the search URL, the Referer
	// header, and absolute result links.
	baseURL string
	// userAgent is sent as the User-Agent header on every request.
	userAgent string

	// maxRetries is the number of additional attempts allowed after a
	// failed search attempt.
	maxRetries int
	// retryDelay is the pause between attempts; values <= 0 disable it.
	retryDelay time.Duration
}
2528

2629
// NewAnimefireClient creates a new Animefire client
@@ -29,84 +32,164 @@ func NewAnimefireClient() *AnimefireClient {
2932
client: &http.Client{
3033
Timeout: 30 * time.Second,
3134
},
32-
baseURL: AnimefireBase,
33-
userAgent: UserAgent,
35+
baseURL: AnimefireBase,
36+
userAgent: UserAgent,
37+
maxRetries: 2,
38+
retryDelay: 350 * time.Millisecond,
3439
}
3540
}
3641

3742
// SearchAnime searches for anime on Animefire.plus using the original logic
3843
func (c *AnimefireClient) SearchAnime(query string) ([]*models.Anime, error) {
39-
searchURL := fmt.Sprintf("%s/pesquisar/%s", c.baseURL, url.QueryEscape(query))
44+
searchURL := fmt.Sprintf("%s/pesquisar/%s", c.baseURL, url.PathEscape(query))
4045

41-
req, err := http.NewRequest("GET", searchURL, nil)
42-
if err != nil {
43-
return nil, fmt.Errorf("failed to create request: %w", err)
46+
var lastErr error
47+
attempts := c.maxRetries + 1
48+
49+
for attempt := 0; attempt < attempts; attempt++ {
50+
req, err := http.NewRequest("GET", searchURL, nil)
51+
if err != nil {
52+
return nil, fmt.Errorf("failed to create request: %w", err)
53+
}
54+
55+
c.decorateRequest(req)
56+
57+
resp, err := c.client.Do(req)
58+
if err != nil {
59+
lastErr = fmt.Errorf("failed to make request: %w", err)
60+
if c.shouldRetry(attempt) {
61+
c.sleep()
62+
continue
63+
}
64+
return nil, lastErr
65+
}
66+
67+
if resp.StatusCode != http.StatusOK {
68+
lastErr = c.handleStatusError(resp)
69+
_ = resp.Body.Close()
70+
if c.shouldRetry(attempt) {
71+
c.sleep()
72+
continue
73+
}
74+
return nil, lastErr
75+
}
76+
77+
doc, err := goquery.NewDocumentFromReader(resp.Body)
78+
_ = resp.Body.Close()
79+
if err != nil {
80+
lastErr = fmt.Errorf("failed to parse HTML: %w", err)
81+
if c.shouldRetry(attempt) {
82+
c.sleep()
83+
continue
84+
}
85+
return nil, lastErr
86+
}
87+
88+
if c.isChallengePage(doc) {
89+
lastErr = errors.New("animefire returned a challenge page (try VPN or wait)")
90+
if c.shouldRetry(attempt) {
91+
c.sleep()
92+
continue
93+
}
94+
return nil, lastErr
95+
}
96+
97+
animes := c.extractSearchResults(doc)
98+
if len(animes) == 0 {
99+
// Legitimate empty result set – return without error
100+
return []*models.Anime{}, nil
101+
}
102+
103+
return animes, nil
104+
}
105+
106+
if lastErr != nil {
107+
return nil, lastErr
44108
}
109+
return nil, errors.New("failed to retrieve results from AnimeFire")
110+
}
45111

112+
func (c *AnimefireClient) decorateRequest(req *http.Request) {
46113
req.Header.Set("User-Agent", c.userAgent)
114+
req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8")
115+
req.Header.Set("Accept-Language", "pt-BR,pt;q=0.9,en-US;q=0.8,en;q=0.7")
116+
req.Header.Set("Cache-Control", "no-cache")
117+
req.Header.Set("Pragma", "no-cache")
118+
req.Header.Set("Referer", c.baseURL+"/")
119+
}
47120

48-
resp, err := c.client.Do(req)
49-
if err != nil {
50-
return nil, fmt.Errorf("failed to make request: %w", err)
121+
func (c *AnimefireClient) handleStatusError(resp *http.Response) error {
122+
if resp.StatusCode == http.StatusForbidden {
123+
return fmt.Errorf("access restricted: VPN may be required")
51124
}
52-
defer func() { _ = resp.Body.Close() }()
125+
return fmt.Errorf("server returned: %s", resp.Status)
126+
}
53127

54-
if resp.StatusCode != http.StatusOK {
55-
if resp.StatusCode == http.StatusForbidden {
56-
return nil, fmt.Errorf("access restricted: VPN may be required")
57-
}
58-
return nil, fmt.Errorf("server returned: %s", resp.Status)
128+
func (c *AnimefireClient) shouldRetry(attempt int) bool {
129+
return attempt < c.maxRetries
130+
}
131+
132+
func (c *AnimefireClient) sleep() {
133+
if c.retryDelay <= 0 {
134+
return
135+
}
136+
time.Sleep(c.retryDelay)
137+
}
138+
139+
func (c *AnimefireClient) isChallengePage(doc *goquery.Document) bool {
140+
title := strings.ToLower(strings.TrimSpace(doc.Find("title").First().Text()))
141+
if strings.Contains(title, "just a moment") {
142+
return true
59143
}
60144

61-
doc, err := goquery.NewDocumentFromReader(resp.Body)
62-
if err != nil {
63-
return nil, fmt.Errorf("failed to parse HTML: %w", err)
145+
if doc.Find("#cf-wrapper").Length() > 0 || doc.Find("#challenge-form").Length() > 0 {
146+
return true
64147
}
65148

149+
body := strings.ToLower(doc.Text())
150+
return strings.Contains(body, "cf-error") || strings.Contains(body, "cloudflare")
151+
}
152+
153+
func (c *AnimefireClient) extractSearchResults(doc *goquery.Document) []*models.Anime {
66154
var animes []*models.Anime
67155

68-
// Use the same parsing logic as the original system
69156
doc.Find(".row.ml-1.mr-1 a").Each(func(i int, s *goquery.Selection) {
70157
if urlPath, exists := s.Attr("href"); exists {
71158
name := strings.TrimSpace(s.Text())
72159
if name != "" {
73-
fullURL := c.resolveURL(c.baseURL, urlPath)
74-
anime := &models.Anime{
160+
animes = append(animes, &models.Anime{
75161
Name: name,
76-
URL: fullURL,
77-
}
78-
animes = append(animes, anime)
162+
URL: c.resolveURL(c.baseURL, urlPath),
163+
})
79164
}
80165
}
81166
})
82167

83-
// If no results with the primary selector, try the card-based selector as fallback
84-
if len(animes) == 0 {
85-
doc.Find(".card_ani").Each(func(i int, s *goquery.Selection) {
86-
titleElem := s.Find(".ani_name a")
87-
title := strings.TrimSpace(titleElem.Text())
88-
link, exists := titleElem.Attr("href")
89-
90-
if exists && title != "" {
91-
// Get image URL
92-
imgElem := s.Find(".div_img img")
93-
imgURL, _ := imgElem.Attr("src")
94-
if imgURL != "" {
95-
imgURL = c.resolveURL(c.baseURL, imgURL)
96-
}
97-
98-
anime := &models.Anime{
99-
Name: title,
100-
URL: c.resolveURL(c.baseURL, link),
101-
ImageURL: imgURL,
102-
}
103-
104-
animes = append(animes, anime)
105-
}
106-
})
168+
if len(animes) > 0 {
169+
return animes
107170
}
108171

109-
return animes, nil
172+
doc.Find(".card_ani").Each(func(i int, s *goquery.Selection) {
173+
titleElem := s.Find(".ani_name a")
174+
title := strings.TrimSpace(titleElem.Text())
175+
link, exists := titleElem.Attr("href")
176+
177+
if exists && title != "" {
178+
imgElem := s.Find(".div_img img")
179+
imgURL, _ := imgElem.Attr("src")
180+
if imgURL != "" {
181+
imgURL = c.resolveURL(c.baseURL, imgURL)
182+
}
183+
184+
animes = append(animes, &models.Anime{
185+
Name: title,
186+
URL: c.resolveURL(c.baseURL, link),
187+
ImageURL: imgURL,
188+
})
189+
}
190+
})
191+
192+
return animes
110193
}
111194

112195
// resolveURL resolves relative URLs to absolute URLs

internal/scraper/animefire_test.go

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
package scraper
2+
3+
import (
4+
"fmt"
5+
"net/http"
6+
"net/http/httptest"
7+
"testing"
8+
9+
"github.com/stretchr/testify/assert"
10+
"github.com/stretchr/testify/require"
11+
)
12+
13+
func TestAnimefireSearchRetriesOnFailure(t *testing.T) {
14+
t.Parallel()
15+
16+
attempts := 0
17+
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
18+
attempts++
19+
if attempts == 1 {
20+
w.WriteHeader(http.StatusBadGateway)
21+
return
22+
}
23+
24+
_, _ = fmt.Fprint(w, `
25+
<html>
26+
<body>
27+
<div class="row ml-1 mr-1">
28+
<a href="/anime/1">Naruto</a>
29+
</div>
30+
</body>
31+
</html>
32+
`)
33+
}))
34+
defer server.Close()
35+
36+
client := NewAnimefireClient()
37+
client.baseURL = server.URL
38+
client.maxRetries = 2
39+
client.retryDelay = 0
40+
41+
results, err := client.SearchAnime("naruto")
42+
require.NoError(t, err)
43+
require.Len(t, results, 1)
44+
45+
assert.Equal(t, "Naruto", results[0].Name)
46+
assert.Equal(t, server.URL+"/anime/1", results[0].URL)
47+
}
48+
49+
func TestAnimefireSearchReturnsEmptySliceWhenNoMatch(t *testing.T) {
50+
t.Parallel()
51+
52+
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
53+
w.WriteHeader(http.StatusOK)
54+
_, _ = fmt.Fprint(w, `<html><body><div class="nothing-here"></div></body></html>`)
55+
}))
56+
defer server.Close()
57+
58+
client := NewAnimefireClient()
59+
client.baseURL = server.URL
60+
client.maxRetries = 1
61+
client.retryDelay = 0
62+
63+
results, err := client.SearchAnime("unknown")
64+
require.NoError(t, err)
65+
assert.Empty(t, results)
66+
}
67+
68+
func TestAnimefireSearchDetectsChallengePage(t *testing.T) {
69+
t.Parallel()
70+
71+
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
72+
w.WriteHeader(http.StatusOK)
73+
_, _ = fmt.Fprint(w, `
74+
<html>
75+
<head><title>Just a moment...</title></head>
76+
<body>
77+
<div id="cf-wrapper">Blocked</div>
78+
</body>
79+
</html>
80+
`)
81+
}))
82+
defer server.Close()
83+
84+
client := NewAnimefireClient()
85+
client.baseURL = server.URL
86+
client.maxRetries = 1
87+
client.retryDelay = 0
88+
89+
_, err := client.SearchAnime("naruto")
90+
require.Error(t, err)
91+
assert.Contains(t, err.Error(), "challenge")
92+
}

0 commit comments

Comments
 (0)