Skip to content

Commit 5258166

Browse files
authored
don't download non-HTML pages (#798)
* don't download non-HTML pages * fix timeouts and HTTP headers
1 parent 19304d6 commit 5258166

File tree

4 files changed

+64
-16
lines changed

4 files changed

+64
-16
lines changed

__utils__/mock-http-requests.js

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,11 @@ nock.enableNetConnect('127.0.0.1')
55

66
// Mock HTTP timeouts: these hosts accept the request but never answer in time.
for (const url of ['https://timeout.com', 'www.timeout.com']) {
  nock(url)
    .get('/')
    // Long enough that any reasonably configured client times out first.
    .delay(1_000_000)
    .reply(200, '<html></html>', { 'Content-Type': 'text/html' })
    .persist()
}
1014

1115
// For a couple of "stock" websites, prevent actually hitting them
@@ -31,7 +35,8 @@ nock.enableNetConnect('127.0.0.1')
3135
<body>
3236
<p>Hello!</p>
3337
</body>
34-
</html>`
38+
</html>`,
39+
{ 'Content-Type': 'text/html' }
3540
)
3641
.persist()
3742
})

lib/got.js

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
const got = require('got')
2+
3+
module.exports = got.default.extend({
4+
handlers: [
5+
(options, next) => {
6+
const promiseOrStream = next(options)
7+
8+
// A destroy function that supports both promises and streams.
9+
// For newer versions, we could use abortcontroller, but alas...
10+
const destroy = message => {
11+
if (options.isStream) {
12+
promiseOrStream.destroy(message)
13+
return
14+
}
15+
16+
// Also note that got v11 is a fucking troll and won't actually pass on the cancellation reason.
17+
promiseOrStream.cancel(message)
18+
}
19+
20+
promiseOrStream.on('response', response => {
21+
const contentType = response.headers['content-type']
22+
23+
// The goal is to not download *anything* if it's not HTML,
24+
// not only because we can't get metadata from non-HTML responses,
25+
// but also because non-HTML responses may cause us to download some gigantic payload.
26+
if (contentType && contentType.startsWith('text/html')) {
27+
options.context.requestLogger.info(
28+
`Received an HTML page. Returning response as-is.`
29+
)
30+
return
31+
}
32+
33+
options.context.requestLogger.info(
34+
`Received a non-HTML response. Aborting early.`
35+
)
36+
destroy('Not an HTML response')
37+
})
38+
39+
return promiseOrStream
40+
}
41+
]
42+
})

lib/scrape.js

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
const got = require('got')
21
const ms = require('ms')
32
const metascraper = require('metascraper')([
43
require('metascraper-author')(),
@@ -11,7 +10,8 @@ const metascraper = require('metascraper')([
1110
require('metascraper-title')()
1211
])
1312
const httpError = require('http-errors')
14-
const log = require('./logger')
13+
const got = require('./got')
14+
const logger = require('./logger')
1515

1616
// const nock = require('nock')
1717
// nock.disableNetConnect()
@@ -25,25 +25,25 @@ const log = require('./logger')
2525
const timeoutMs = ms(process.env.LINK_TIMEOUT)
2626

2727
module.exports = async url => {
28-
log.info(`Scraping %s for metadata...`, url)
28+
const requestLogger = logger.child({ url })
29+
requestLogger.info(`Scraping %s for metadata...`, url)
2930

3031
try {
31-
const promise = got(url, {
32-
timeout: { request: timeoutMs }
32+
const { body: html, url: finalUrl } = await got(url, {
33+
// Got is fucking stupid and this is the only way we can actually get the fucking timeouts to work.
34+
timeout: { socket: timeoutMs, request: timeoutMs },
35+
context: { requestLogger }
3336
})
34-
// TODO: just rely on got's built-in timeout once got v12 comes out
35-
setTimeout(() => {
36-
// At the moment, got's timeout doesn't work for shit
37-
promise.cancel()
38-
}, timeoutMs)
39-
40-
const { body: html, url: finalUrl } = await promise
4137
return metascraper({ html, url: finalUrl })
4238
} catch (err) {
4339
if (err.name === 'RequestError' && err.code === 'ENOTFOUND')
4440
throw httpError(404, 'The address to shorten does not exist!')
45-
if (err.name === 'CancelError')
41+
if (err.name === 'TimeoutError')
4642
throw httpError(504, 'Could not scrape link in time!')
43+
// If we were able to reach an actual thing at the other end,
44+
// but the request got canceled because it's not an HTML,
45+
// we don't care about it as we cannot get any useful metadata from the response.
46+
if (err.name === 'CancelError') return null
4747
else throw err
4848
}
4949
}

models/link.js

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,8 @@ class Link extends hashId(BaseModel) {
6666
await super.$beforeInsert(queryContext)
6767

6868
// update metadata by visiting the URL
69-
this.meta = merge(await scrape(this.originalUrl), this.meta)
69+
const scrapedMetadata = await scrape(this.originalUrl)
70+
this.meta = merge({}, this.meta, scrapedMetadata)
7071
}
7172

7273
static get virtualAttributes() {

0 commit comments

Comments (0)