All files / src/middleware no-index.js

80% Statements 24/30
72.22% Branches 13/18
100% Functions 7/7
87.5% Lines 21/24

Press n or j to go to the next uncovered block, b, p or k for the previous block.

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129                                                                            1x   1x     16x 16x       5x       1x                                     1x 1x     8x 8x 6x                               6x 6x 4x   6x                             2x 2x         1x 1x   1x           1x  
/**
 * Express middleware that locks down non-prod ShyTalk API hostnames
 * from search-engine indexing.
 *
 *   - Adds `X-Robots-Tag: noindex, nofollow, noarchive` to every
 *     response from non-prod hostnames so any URL exposed (e.g.
 *     `dev-api.shytalk.shyden.co.uk/api/firebase-config`) is dropped
 *     from search results.
 *   - Serves `/robots.txt` with `Disallow: /` on non-prod, and a
 *     standard "allow everything" response on prod.
 *
 * Detection is by `req.hostname` (Express-parsed Host header), NOT by
 * `process.env.NODE_ENV` — the dev VM may run with NODE_ENV=production
 * for pm2 reasons, so the hostname is the more reliable signal. The
 * single-source-of-truth `PROD_API_HOSTNAME` lives in
 * `functions/_lib/lockdown.js` so the web Pages middleware and this
 * Express middleware agree on what "prod" means.
 *
 * Does NOT add basic auth — the API is consumed by mobile apps that
 * embed the URL in their binary, and adding HTTP auth would break
 * their request flow. Discoverability via search is the actual leak,
 * and noindex closes that.
 */
 
// NOTE: helpers are inlined below rather than required from
// `../../../functions/_lib/lockdown.js` because the deploy workflow's
// backend tarball is built with `cd express-api && tar czf api.tar.gz .`
// and cannot reach files outside that directory. The first deploy of
// this middleware (run 25276782190) crashed the dev API at startup
// with `MODULE_NOT_FOUND: ../../functions/_lib/lockdown.js`. The
// Cloudflare Pages middleware in `functions/_middleware.js` continues
// to use the shared `_lib/lockdown.js` because Pages deploys it from
// the `functions/` directory at the project root (sibling of `public/`).
//
// Drift risk between the two copies is mitigated by `dev-lockdown-middleware.test.js`
// pinning the rules from one side and `no-index.test.js` pinning the
// integration on the other; if the constants ever diverge the dev API
// or Pages function behaviour will fail those tests.
const log = require('../utils/log');
 
// The one hostname treated as production. Every other hostname is
// classified non-prod and gets the noindex treatment. Must stay in sync
// with the copy in `functions/_lib/lockdown.js` (see the note above on
// why that module cannot be required from here).
const PROD_API_HOSTNAME = 'api.shytalk.shyden.co.uk';

/**
 * Reports whether `hostname` is the production API hostname.
 *
 * The comparison is case-insensitive. Anything that is not a non-empty
 * string (undefined, null, numbers, '') is treated as non-prod so a
 * missing Host header fails closed, i.e. towards noindex.
 *
 * @param {*} hostname - Value to classify, typically `req.hostname`.
 * @returns {boolean} `true` only for an exact (case-insensitive) match
 *   with `PROD_API_HOSTNAME`.
 */
function isProdApiHostname(hostname) {
  if (typeof hostname !== 'string' || hostname.length === 0) return false;
  return hostname.toLowerCase() === PROD_API_HOSTNAME;
}
 
/**
 * Value sent in the `X-Robots-Tag` response header on non-prod hosts.
 *
 * @returns {string} Comma-separated robots directives.
 */
function noIndexHeaderValue() {
  const directives = 'noindex, nofollow, noarchive';
  return directives;
}
 
/**
 * Builds the robots.txt body served on non-prod hosts: a short
 * explanatory comment header followed by a blanket disallow rule.
 *
 * @returns {string} Newline-terminated robots.txt content.
 */
function blockingRobotsBody() {
  const commentLines = [
    '# Non-prod ShyTalk environment — blocked from indexing.',
    '# See express-api/src/middleware/no-index.js (and',
    '# functions/_lib/lockdown.js for the web side).',
  ];
  const directiveLines = ['User-agent: *', 'Disallow: /'];
  // Terminate every line (including the last) with a newline.
  return [...commentLines, ...directiveLines].map((line) => `${line}\n`).join('');
}
 
// Hostnames we expect to see at runtime. Anything outside this set is a
// misconfiguration — most likely Caddy / a future reverse-proxy edit
// dropped or rewrote the Host header. We log a warning ONCE per unseen
// hostname so the misclassification surfaces in logs instead of silently
// shipping noindex on prod (or, worse, NOT shipping noindex on a
// dev hostname that doesn't match our pattern).
//
// Memoised in module scope so the warning fires at most once per process
// per hostname — without memoisation a misconfigured proxy floods logs.
const KNOWN_NON_PROD_HOSTNAME_PATTERNS = [/^dev-api\./i, /^localhost$/i, /^127\.0\.0\.1$/];
const seenUnexpectedHostnames = new Set();

// The Host header is client-controlled, so the memo set must be bounded:
// without a cap, a client rotating Host values would grow the set (and
// the log) without limit. Once the cap is reached, further unseen
// hostnames are no longer recorded or warned about.
const MAX_TRACKED_UNEXPECTED_HOSTNAMES = 1000;

/**
 * Logs a one-time warning when `hostname` is neither the prod API
 * hostname nor one of the known non-prod patterns.
 *
 * Silently returns for non-string / empty values, for the prod hostname,
 * for known non-prod hostnames, for hostnames already warned about, and
 * once `MAX_TRACKED_UNEXPECTED_HOSTNAMES` distinct hostnames have been
 * recorded in this process.
 *
 * @param {*} hostname - Value to inspect, typically `req.hostname`.
 * @returns {void}
 */
function warnIfUnexpectedHostname(hostname) {
  if (typeof hostname !== 'string' || hostname.length === 0) return;
  if (isProdApiHostname(hostname)) return;
  if (KNOWN_NON_PROD_HOSTNAME_PATTERNS.some((p) => p.test(hostname))) return;
  if (seenUnexpectedHostnames.has(hostname)) return;
  // Bound memory and log volume against arbitrary Host header values.
  if (seenUnexpectedHostnames.size >= MAX_TRACKED_UNEXPECTED_HOSTNAMES) return;
  seenUnexpectedHostnames.add(hostname);
  log.warn(
    'no-index',
    'Unexpected hostname classified non-prod — verify reverse proxy is forwarding the public Host header. ' +
      'Misclassification could ship X-Robots-Tag: noindex on prod or break dev SEO defence.',
    { hostname, expectedProd: PROD_API_HOSTNAME },
  );
}
 
/**
 * Express middleware: tags every response from a non-prod hostname with
 * `X-Robots-Tag` so it is kept out of search indexes.
 *
 * Mount with `app.use(noIndex)` BEFORE any other route so the header is
 * already set on whatever responds downstream (errors, 404s, etc.).
 *
 * @param {object} req - Express request; only `req.hostname` is read.
 * @param {object} res - Express response; `res.set` is called on non-prod.
 * @param {Function} next - Always invoked, with no arguments.
 */
function noIndex(req, res, next) {
  const { hostname } = req;
  warnIfUnexpectedHostname(hostname);

  const servedFromProd = isProdApiHostname(hostname);
  if (!servedFromProd) {
    res.set('X-Robots-Tag', noIndexHeaderValue());
  }

  next();
}
 
/**
 * Route handler for `app.get('/robots.txt', robotsTxt)`.
 *
 *   - prod hostname: permissive `Allow: /` body
 *   - any other hostname: `Disallow: /` body plus the `X-Robots-Tag`
 *     noindex header
 *
 * Mounted at the root rather than under `/api` because crawlers fetch
 * `https://<host>/robots.txt` from the root.
 *
 * @param {object} req - Express request; only `req.hostname` is read.
 * @param {object} res - Express response used to send the text body.
 */
function robotsTxt(req, res) {
  const { hostname } = req;

  // A misclassified hostname here would deindex prod without any other
  // visible symptom, so surface unexpected hostnames in the logs.
  warnIfUnexpectedHostname(hostname);

  const contentType = 'text/plain; charset=utf-8';

  if (!isProdApiHostname(hostname)) {
    res
      .type(contentType)
      .set('X-Robots-Tag', noIndexHeaderValue())
      .send(blockingRobotsBody());
    return;
  }

  // Prod: the API host has no HTML to index, but an explicit allow keeps
  // the file present and documents intent.
  const permissiveBody = ['User-agent: *', 'Allow: /', ''].join('\n');
  res.type(contentType).send(permissiveBody);
}
 
// Public surface: `noIndex` is the app-wide middleware and `robotsTxt`
// is the `/robots.txt` route handler. The helpers above stay private.
module.exports = { noIndex, robotsTxt };