Skip to content

Commit b6b9638

Browse files
committed
Fix possible wrong encoding
1 parent 35b42e4 commit b6b9638

File tree

3 files changed

+14
-6
lines changed

3 files changed

+14
-6
lines changed

lib/index.js

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ class PuppeteerPlugin {
4747
await blockNavigation(page, url);
4848
}
4949

50-
await page.goto(url, this.gotoOptions);
50+
const puppeteerResponse = await page.goto(url, this.gotoOptions);
5151

5252
if (this.scrollToBottom) {
5353
await scrollToBottom(page, this.scrollToBottom.timeout, this.scrollToBottom.viewportN);
@@ -56,10 +56,12 @@ class PuppeteerPlugin {
5656
const content = await page.content();
5757
await page.close();
5858

59-
// convert utf-8 -> binary string because website-scraper needs binary
60-
return Buffer.from(content).toString('binary');
59+
const encoding = extractEncodingFromHeader(puppeteerResponse.headers())
60+
const body = Buffer.from(content).toString(encoding);
61+
62+
return { body, encoding }
6163
} else {
62-
return response.body;
64+
return { body: response.body }
6365
}
6466
});
6567

@@ -91,4 +93,10 @@ async function blockNavigation (page, url) {
9193
await page.setRequestInterception(true);
9294
}
9395

96+
/**
 * Choose the encoding name to use for the page body based on the response's
 * `content-type` header.
 *
 * @param {Object} headers - response headers; only the `content-type` key is read
 * @returns {string} `'utf8'` when the content type mentions UTF-8, otherwise `'binary'`
 */
function extractEncodingFromHeader (headers) {
	const contentTypeHeader = headers ? headers['content-type'] : undefined;

	// Charset names are case-insensitive and servers commonly send `charset=UTF-8`,
	// so a case-sensitive substring check would wrongly fall back to 'binary'.
	// The hyphen is optional so the `utf8` label is recognised as well.
	const isUtf8 = typeof contentTypeHeader === 'string' && /utf-?8/i.test(contentTypeHeader);

	return isUtf8 ? 'utf8' : 'binary';
}
101+
94102
export default PuppeteerPlugin;

test/mock/index.html

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
* with cheerio and website-scraper itself.
1818
* See https://github.com/cheeriojs/cheerio/pull/2280
1919
*/
20-
document.getElementById('special-characters-test').innerText = '7년 동안 한국에서 살았어요. Слава Україні!';
20+
document.getElementById('special-characters-test').innerText = '7년 동안 한국에서 살았어요. Слава Україні! 磁致伸缩位移传感器 影响大跨度桥梁施工控制的因素';
2121
};
2222
</script>
2323

test/puppeteer-plugin.test.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ describe('Puppeteer plugin test', () => {
3939
});
4040

4141
it('should render special characters correctly', async () => {
42-
expect(content).to.contain('<div id="special-characters-test">7년 동안 한국에서 살았어요. Слава Україні!</div>');
42+
expect(content).to.contain('<div id="special-characters-test">7년 동안 한국에서 살았어요. Слава Україні! 磁致伸缩位移传感器 影响大跨度桥梁施工控制的因素</div>');
4343
});
4444
});
4545

0 commit comments

Comments
 (0)