I’m looking to crawl Bluesky, but have an issue with replay because it’s missing some chunked javascript files, which aren’t downloaded during the crawl.
You can reproduce this in Firefox with the ‘Save Page As…’ dialog. When you open the saved page, you get a flash of content followed by a blank screen.
I found this list in the (minified) javascript running on bsky.app.
// Excerpt from the minified JavaScript served by bsky.app (quoted verbatim).
// n.u maps a numeric chunk id `e` to that chunk's file path:
//   'static/js/' + <id> + '.' + <hash for that id> + '.chunk.js'
// The object literal is the id -> hash lookup table. An id that is missing
// from the table would put the literal text "undefined" into the path.
// NOTE(review): presumably this is the bundler's chunk-filename helper for
// lazily loaded chunks — confirm against the unminified build.
n.u = e => 'static/js/' + e + '.' + {
19: 'd40abe57',
39: '8226b52f',
77: 'abec2799',
82: 'b485c271',
92: 'a4eb7114',
101: 'b2bd00ad',
108: '0faec14c',
145: '733ab520',
251: 'ee70bf06',
262: '947ef69f',
288: '3aa3c9a7',
328: 'd9d8df10',
348: '663ea3da',
373: 'e48f7dba',
418: '3904b460',
419: '51d6af13',
470: 'ada69b9e',
491: 'b24b403b',
508: '58cd084f',
521: 'f001da5c',
528: 'aa332f55',
640: '6cb8e9a8',
660: 'cb932a11',
666: 'e86a3eb6',
667: '061f2baf',
697: 'b79c832e',
703: '67ba7a99',
742: 'a1ea8a2b',
767: '277b5c8c',
777: '3c1af201',
789: '2eeeedce',
792: 'bca82f4f',
812: 'b08b8b5b',
850: 'a2bf8641',
871: 'dee844d8',
874: '652c5fdc',
889: '5145c908',
905: '4df37e1f',
930: '30d77223',
931: 'ac811c09',
936: 'e2b07d08',
939: '599fa846',
953: '42c43d79',
977: '8d7d30a6',
982: 'cf97360e'
}
[
e
] + '.chunk.js'
This yields URLs of the form https://web-cdn.bsky.app/static/js/<number>.<hash>.chunk.js; the hashes are probably generated during the build process.
I’m not really able to get into building and reverse-engineering the Bluesky web app for now, so my solution is to just explicitly download those files when crawling any bluesky page.
/**
 * Custom crawl behavior that explicitly fetches the chunked JavaScript files
 * of the Bluesky web app, so they are captured even when the page does not
 * request them itself during the crawl.
 *
 * The chunk list is a snapshot of the id -> hash table found in bsky.app's
 * minified bundle; it has to be refreshed whenever Bluesky ships a new build.
 */
class GetBlueskyExtraJs {
  static id = "Bluesky extra javascript";

  /**
   * Decide whether this behavior applies to the current page.
   *
   * The pattern is anchored at the start of the URL so that a page on another
   * site that merely contains "https://bsky.app/" (e.g. in a query string)
   * does not match.
   *
   * @returns {boolean} true when the page URL starts with https://bsky.app/
   */
  static isMatch() {
    return /^https:\/\/bsky\.app\//.test(window.location.href);
  }

  /**
   * Provide the initial behavior context.
   *
   * @returns {{state: object}} an object holding an empty state
   */
  static init() {
    return {
      state: {},
    };
  }

  /**
   * Fetch every known chunk file and wait until all fetches have settled.
   *
   * @param {object} ctx - behavior context; `ctx.Lib.doExternalFetch(url)` is
   *   called once per chunk URL.
   * @yields {{msg: string, fetched: number, failed: number}} a single summary
   *   step after all fetches have settled.
   */
  async *run(ctx) {
    const { doExternalFetch } = ctx.Lib;

    const cdnBase = "https://web-cdn.bsky.app/static/js/";

    // Chunk id -> hash, copied from the lookup table in bsky.app's minified
    // bundle. Integer-like keys are enumerated in ascending numeric order.
    const chunkHashes = {
      19: "d40abe57",
      39: "8226b52f",
      77: "abec2799",
      82: "b485c271",
      92: "a4eb7114",
      101: "b2bd00ad",
      108: "0faec14c",
      145: "733ab520",
      251: "ee70bf06",
      262: "947ef69f",
      288: "3aa3c9a7",
      328: "d9d8df10",
      348: "663ea3da",
      373: "e48f7dba",
      418: "3904b460",
      419: "51d6af13",
      470: "ada69b9e",
      491: "b24b403b",
      508: "58cd084f",
      521: "f001da5c",
      528: "aa332f55",
      640: "6cb8e9a8",
      660: "cb932a11",
      666: "e86a3eb6",
      667: "061f2baf",
      697: "b79c832e",
      703: "67ba7a99",
      742: "a1ea8a2b",
      767: "277b5c8c",
      777: "3c1af201",
      789: "2eeeedce",
      792: "bca82f4f",
      812: "b08b8b5b",
      850: "a2bf8641",
      871: "dee844d8",
      874: "652c5fdc",
      889: "5145c908",
      905: "4df37e1f",
      930: "30d77223",
      931: "ac811c09",
      936: "e2b07d08",
      939: "599fa846",
      953: "42c43d79",
      977: "8d7d30a6",
      982: "cf97360e",
    };

    const chunkUrls = Object.entries(chunkHashes).map(
      ([chunkId, hash]) => `${cdnBase}${chunkId}.${hash}.chunk.js`,
    );

    // Start all fetches together, but do not let the generator finish until
    // every one of them has settled; otherwise the behavior could end before
    // the files have been requested. allSettled keeps one failing chunk from
    // aborting the others and prevents unhandled rejections.
    const outcomes = await Promise.allSettled(
      chunkUrls.map(async (chunkUrl) => doExternalFetch(chunkUrl)),
    );

    // NOTE(review): doExternalFetch presumably resolves to a success flag;
    // an explicit `false` is counted as a failure — confirm against the
    // crawler's behavior library.
    const fetched = outcomes.filter(
      (outcome) => outcome.status === "fulfilled" && outcome.value !== false,
    ).length;

    yield {
      msg: "Fetched extra Bluesky chunk files",
      fetched,
      failed: outcomes.length - fetched,
    };
  }
}
I’m aware this isn’t a long-term solution because this hard-coded list will probably change next time Bluesky rolls out a new update.
Also, JavaScript is not my speciality. This custom behavior works, but I’d appreciate comments or suggestions for improvement.