Custom behaviour for Bluesky chunked js

I’m looking to crawl Bluesky, but replay is broken because some chunked JavaScript files are missing: they aren’t downloaded during the crawl.

You can reproduce this in Firefox with the ‘Save Page As…’ dialog. Opening the saved page you get a flash of content followed by a blank screen.

I found this list in the (minified) javascript running on bsky.app.

// Excerpt from the minified bsky.app bundle (`n` is defined elsewhere in that bundle).
// Maps a numeric chunk id `e` to the relative path
// 'static/js/<id>.<hash>.chunk.js' by looking the id up in the table below.
// An id that is not in the table yields 'undefined' in the hash position.
// NOTE(review): this looks like webpack's chunk-filename helper — confirm against the bundler config.
n.u = e => 'static/js/' + e + '.' + {
  // chunk id -> content hash
  19: 'd40abe57',
  39: '8226b52f',
  77: 'abec2799',
  82: 'b485c271',
  92: 'a4eb7114',
  101: 'b2bd00ad',
  108: '0faec14c',
  145: '733ab520',
  251: 'ee70bf06',
  262: '947ef69f',
  288: '3aa3c9a7',
  328: 'd9d8df10',
  348: '663ea3da',
  373: 'e48f7dba',
  418: '3904b460',
  419: '51d6af13',
  470: 'ada69b9e',
  491: 'b24b403b',
  508: '58cd084f',
  521: 'f001da5c',
  528: 'aa332f55',
  640: '6cb8e9a8',
  660: 'cb932a11',
  666: 'e86a3eb6',
  667: '061f2baf',
  697: 'b79c832e',
  703: '67ba7a99',
  742: 'a1ea8a2b',
  767: '277b5c8c',
  777: '3c1af201',
  789: '2eeeedce',
  792: 'bca82f4f',
  812: 'b08b8b5b',
  850: 'a2bf8641',
  871: 'dee844d8',
  874: '652c5fdc',
  889: '5145c908',
  905: '4df37e1f',
  930: '30d77223',
  931: 'ac811c09',
  936: 'e2b07d08',
  939: '599fa846',
  953: '42c43d79',
  977: '8d7d30a6',
  982: 'cf97360e'
}
[
  e // index the table with the requested chunk id
] + '.chunk.js'

This returns URLs of the form https://web-cdn.bsky.app/static/js/<number>.<hash>.chunk.js; the hashes are probably generated during a build process.

I’m not really able to get into building and reverse-engineering the Bluesky web app for now, so my solution is to just explicitly download those files when crawling any bluesky page.

/**
 * Custom crawl behavior that explicitly fetches the lazily-loaded Bluesky
 * JavaScript chunks so they are present in the archive at replay time.
 *
 * The chunk list is a snapshot of one Bluesky build; the hashes change when
 * Bluesky ships a new build, so `chunkHashes` must be refreshed accordingly.
 */
class GetBlueskyExtraJs {

  static id = "Bluesky extra javascript";

  /** Common URL prefix of every chunk file. */
  static chunkBaseUrl = "https://web-cdn.bsky.app/static/js/";

  /**
   * Chunk id -> content hash, in the same shape as the table found in the
   * minified bsky.app bundle so it can be pasted over when the build changes.
   */
  static chunkHashes = {
    19: "d40abe57",
    39: "8226b52f",
    77: "abec2799",
    82: "b485c271",
    92: "a4eb7114",
    101: "b2bd00ad",
    108: "0faec14c",
    145: "733ab520",
    251: "ee70bf06",
    262: "947ef69f",
    288: "3aa3c9a7",
    328: "d9d8df10",
    348: "663ea3da",
    373: "e48f7dba",
    418: "3904b460",
    419: "51d6af13",
    470: "ada69b9e",
    491: "b24b403b",
    508: "58cd084f",
    521: "f001da5c",
    528: "aa332f55",
    640: "6cb8e9a8",
    660: "cb932a11",
    666: "e86a3eb6",
    667: "061f2baf",
    697: "b79c832e",
    703: "67ba7a99",
    742: "a1ea8a2b",
    767: "277b5c8c",
    777: "3c1af201",
    789: "2eeeedce",
    792: "bca82f4f",
    812: "b08b8b5b",
    850: "a2bf8641",
    871: "dee844d8",
    874: "652c5fdc",
    889: "5145c908",
    905: "4df37e1f",
    930: "30d77223",
    931: "ac811c09",
    936: "e2b07d08",
    939: "599fa846",
    953: "42c43d79",
    977: "8d7d30a6",
    982: "cf97360e",
  };

  /**
   * @returns {boolean} true when the current page URL starts with
   *   "https://bsky.app/". A prefix test is used so that URLs which merely
   *   contain that string (e.g. in a query parameter) do not match.
   */
  static isMatch() {
    return window.location.href.startsWith("https://bsky.app/");
  }

  /**
   * @returns {{state: Object}} initial (empty) behavior state.
   */
  static init() {
    return {
      state: {},
    };
  }

  /**
   * Builds the absolute URL of every known chunk.
   *
   * @returns {string[]} chunk URLs in ascending chunk-id order (integer-like
   *   object keys are enumerated in ascending numeric order).
   */
  static chunkUrls() {
    const { chunkBaseUrl, chunkHashes } = GetBlueskyExtraJs;
    return Object.entries(chunkHashes).map(
      ([chunkId, hash]) => `${chunkBaseUrl}${chunkId}.${hash}.chunk.js`,
    );
  }

  /**
   * Requests every chunk URL through `ctx.Lib.doExternalFetch` and waits for
   * all requests to settle before the generator completes. Nothing is yielded.
   *
   * @param {Object} ctx - behavior context; only `ctx.Lib.doExternalFetch` is used.
   */
  async *run(ctx) {
    const { doExternalFetch } = ctx.Lib;
    const chunkUrls = GetBlueskyExtraJs.chunkUrls();

    // All fetches start concurrently. The async wrapper turns a synchronous
    // throw into a rejection, and allSettled keeps one failed chunk from
    // aborting the rest (it also accepts plain, non-promise return values).
    const outcomes = await Promise.allSettled(
      chunkUrls.map(async (cdnUrl) => doExternalFetch(cdnUrl)),
    );

    outcomes.forEach((outcome, index) => {
      if (outcome.status === "rejected") {
        // Best-effort: report the failure instead of leaving an unhandled rejection.
        console.warn(
          `Bluesky extra javascript: failed to fetch ${chunkUrls[index]}`,
          outcome.reason,
        );
      }
    });
  }
}

I’m aware this isn’t a long-term solution because this hard-coded list will probably change next time Bluesky rolls out a new update.

Also, Javascript is not my speciality. This custom function works, but I’d appreciate comments or suggestions for improvement.

Hm, you shouldn’t have to do that. I think the chunks change after updates to the UI, but not on every load. It might be using some local storage session info, though.

Have you tried crawling with Save Local and Session Storage enabled (in Browser settings)?

Oh, and I think this is only needed for logged-in crawling; it should work without it when not logged in.

However, you may want a custom behavior to click on posts, open images, play videos, etc. That’s usually what we use custom behaviors for.