All files / src/utils similarity.js

100% Statements 37/37
100% Branches 36/36
100% Functions 3/3
100% Lines 26/26

Press n or j to go to the next uncovered block, b, p or k for the previous block.

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78                        119x                         83x       59x 59x 59x 53x   50x 3x     47x 1730x 1980x   47x 1683x 524084x 524084x               47x                     54x 54x 54x 52x 49x   33x 33x 33x 33x     83x  
/**
 * Text similarity utilities for duplicate detection and blocked topic matching.
 *
 * Uses normalised Levenshtein distance to compute similarity ratio (0-1).
 */
 
/**
 * Normalise text for comparison: lowercase, strip punctuation, collapse whitespace.
 * @param {string} text
 * @returns {string}
 */
function normalise(text) {
  return (text || '')
    .toLowerCase()
    .replace(/[^\w\s\u00C0-\u024F\u0400-\u04FF\u0600-\u06FF\u3000-\u9FFF\uAC00-\uD7AF]/g, '')
    .replace(/\s+/g, ' ')
    .trim();
}
 
/**
 * Compute Levenshtein edit distance between two strings.
 * @param {string} a
 * @param {string} b
 * @returns {number}
 */
const MAX_EDIT_DISTANCE_LEN = 500;
 
function editDistance(a, b) {
  // Coerce to strings to prevent type confusion if called with non-string input
  a = typeof a === 'string' ? a : String(a ?? '');
  b = typeof b === 'string' ? b : String(b ?? '');
  if (a.length === 0) return b.length;
  if (b.length === 0) return a.length;
  // Prevent CPU exhaustion with very long strings (O(n*m) algorithm)
  if (a.length > MAX_EDIT_DISTANCE_LEN || b.length > MAX_EDIT_DISTANCE_LEN) {
    return a === b ? 0 : Math.max(a.length, b.length);
  }
 
  const matrix = [];
  for (let i = 0; i <= b.length; i++) matrix[i] = [i];
  for (let j = 0; j <= a.length; j++) matrix[0][j] = j;
 
  for (let i = 1; i <= b.length; i++) {
    for (let j = 1; j <= a.length; j++) {
      const cost = b[i - 1] === a[j - 1] ? 0 : 1;
      matrix[i][j] = Math.min(
        matrix[i - 1][j] + 1,
        matrix[i][j - 1] + 1,
        matrix[i - 1][j - 1] + cost,
      );
    }
  }
 
  return matrix[b.length][a.length];
}
 
/**
 * Compute similarity ratio between two strings (0 = completely different, 1 = identical).
 * Uses normalised Levenshtein distance.
 * @param {string} a
 * @param {string} b
 * @returns {number}
 */
function similarity(a, b) {
  const sa = normalise(a);
  const sb = normalise(b);
  if (!sa && !sb) return 1;
  if (!sa || !sb) return 0;
  if (sa === sb) return 1;
 
  const longer = sa.length >= sb.length ? sa : sb;
  const shorter = sa.length >= sb.length ? sb : sa;
  const dist = editDistance(longer, shorter);
  return (longer.length - dist) / longer.length;
}
 
module.exports = { similarity, normalise, editDistance };