Skip to content

Commit 5258166

Browse files
authored
don't download non-HTML pages (#798)
* don't download non-HTML pages * fix timeouts and HTTP headers
1 parent 19304d6 commit 5258166

File tree

4 files changed

+64
-16
lines changed

4 files changed

+64
-16
lines changed

__utils__/mock-http-requests.js

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,11 @@ nock.enableNetConnect('127.0.0.1')
55

66
// Mock HTTP timeouts: these hosts accept the request but never answer in time.
for (const url of ['https://timeout.com', 'www.timeout.com']) {
  nock(url)
    .get('/')
    // Long enough that any reasonably configured client times out first.
    .delay(1_000_000)
    .reply(200, '<html></html>', { 'Content-Type': 'text/html' })
    .persist()
}
1014

1115
// For a couple of "stock" websites, prevent actually hitting them
@@ -31,7 +35,8 @@ nock.enableNetConnect('127.0.0.1')
3135
<body>
3236
<p>Hello!</p>
3337
</body>
34-
</html>`
38+
</html>`,
39+
{ 'Content-Type': 'text/html' }
3540
)
3641
.persist()
3742
})

lib/got.js

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
const got = require('got')
2+
3+
module.exports = got.default.extend({
4+
handlers: [
5+
(options, next) => {
6+
const promiseOrStream = next(options)
7+
8+
// A destroy function that supports both promises and streams.
9+
// For newer versions, we could use abortcontroller, but alas...
10+
const destroy = message => {
11+
if (options.isStream) {
12+
promiseOrStream.destroy(message)
13+
return
14+
}
15+
16+
// Also note that got v11 is a fucking troll and won't actually pass on the cancellation reason.
17+
promiseOrStream.cancel(message)
18+
}
19+
20+
promiseOrStream.on('response', response => {
21+
const contentType = response.headers['content-type']
22+
23+
// The goal is to not download *anything* if it's not HTML,
24+
// not only because we can't get metadata from non-HTML responses,
25+
// but also because non-HTML responses may cause us to download some gigantic payload.
26+
if (contentType && contentType.startsWith('text/html')) {
27+
options.context.requestLogger.info(
28+
`Received an HTML page. Returning response as-is.`
29+
)
30+
return
31+
}
32+
33+
options.context.requestLogger.info(
34+
`Received a non-HTML response. Aborting early.`
35+
)
36+
destroy('Not an HTML response')
37+
})
38+
39+
return promiseOrStream
40+
}
41+
]
42+
})

lib/scrape.js

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
const got = require('got')
21
const ms = require('ms')
32
const metascraper = require('metascraper')([
43
require('metascraper-author')(),
@@ -11,7 +10,8 @@ const metascraper = require('metascraper')([
1110
require('metascraper-title')()
1211
])
1312
const httpError = require('http-errors')
14-
const log = require('./logger')
13+
const got = require('./got')
14+
const logger = require('./logger')
1515

1616
// const nock = require('nock')
1717
// nock.disableNetConnect()
@@ -25,25 +25,25 @@ const log = require('./logger')
2525
const timeoutMs = ms(process.env.LINK_TIMEOUT)
2626

2727
module.exports = async url => {
28-
log.info(`Scraping %s for metadata...`, url)
28+
const requestLogger = logger.child({ url })
29+
requestLogger.info(`Scraping %s for metadata...`, url)
2930

3031
try {
31-
const promise = got(url, {
32-
timeout: { request: timeoutMs }
32+
const { body: html, url: finalUrl } = await got(url, {
33+
// Got is fucking stupid and this is the only way we can actually get the fucking timeouts to work.
34+
timeout: { socket: timeoutMs, request: timeoutMs },
35+
context: { requestLogger }
3336
})
34-
// TODO: just rely on got's built-in timeout once got v12 comes out
35-
setTimeout(() => {
36-
// At the moment, got's timeout doesn't work for shit
37-
promise.cancel()
38-
}, timeoutMs)
39-
40-
const { body: html, url: finalUrl } = await promise
4137
return metascraper({ html, url: finalUrl })
4238
} catch (err) {
4339
if (err.name === 'RequestError' && err.code === 'ENOTFOUND')
4440
throw httpError(404, 'The address to shorten does not exist!')
45-
if (err.name === 'CancelError')
41+
if (err.name === 'TimeoutError')
4642
throw httpError(504, 'Could not scrape link in time!')
43+
// If we were able to reach an actual thing at the other end,
44+
// but the request got canceled because it's not an HTML,
45+
// we don't care about it as we cannot get any useful metadata from the response.
46+
if (err.name === 'CancelError') return null
4747
else throw err
4848
}
4949
}

models/link.js

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,8 @@ class Link extends hashId(BaseModel) {
6666
await super.$beforeInsert(queryContext)
6767

6868
// update metadata by visiting the URL
69-
this.meta = merge(await scrape(this.originalUrl), this.meta)
69+
const scrapedMetadata = await scrape(this.originalUrl)
70+
this.meta = merge({}, this.meta, scrapedMetadata)
7071
}
7172

7273
static get virtualAttributes() {

0 commit comments

Comments (0)