User:Polygnotus/Scripts/DeduplicateReferences.js

Code that you insert on this page could contain malicious content capable of compromising your account. If you import a script from another page with "importScript", "mw.loader.load", "iusc", or "lusc", take note that this causes you to dynamically load a remote script, which could be changed by others. Editors are responsible for all edits and actions they perform, including by scripts. User scripts are not centrally supported and may malfunction or become inoperable due to software changes. A guide to help you find broken scripts is available. If you are unsure whether code you are adding to this page is safe, you can ask at the appropriate village pump.
This code will be executed when previewing this page.

Documentation for this user script can be added at User:Polygnotus/Scripts/DeduplicateReferences.

Note: After saving, you have to bypass your browser's cache to see the changes. Google Chrome, Firefox, Microsoft Edge and Safari: Hold down the ⇧ Shift key and click the Reload toolbar button. For details and instructions about other browsers, see Wikipedia:Bypass your cache.

// <nowiki>
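// Only exact duplicates are merged: two <ref>...</ref> tags must be character-for-character identical.
// For duplicated references that have no name, a name is generated from the domain of the first URL found in the reference.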

// Function to deduplicate references in Wikipedia articles
function deduplicateReferences() {
  // Get the edit textarea and summary input
  const editTextarea = document.getElementById('wpTextbox1');
  const summaryInput = document.getElementById('wpSummary');
  if (!editTextarea || !summaryInput) return;

  let content = editTextarea.value;
  
  // Regular expression to match <ref> tags
  const refRegex = /<ref[^>]*>[\s\S]*?<\/ref>/gi;
  
  // Object to store all references
  const allRefs = {};
  
  // Set to store all used reference names
  const usedNames = new Set();
  
  // Blacklist of reference names to ignore
  const blacklist = [
    "doi_org",
    "jstor_org",
    "amazon_com",
    "books_google_com",
    "web_archive_org",
    "worldcat_org",
    "dx_doi_org"
    // Add more blacklisted names here
  ];
  
  // Function to extract domain name from URL
  function extractDomain(url) {
    try {
      let domain = new URL(url).hostname;
      domain = domain.replace(/^www\./, '');  // Remove 'www.' if present
      return domain === 'archive.org' ? extractDomain(url.split('archive.org/web/')[1]) : domain;
    } catch (e) {
      return null;
    }
  }
  
  // Function to generate a unique name for the reference
  function generateUniqueName(ref) {
    const urlMatch = ref.match(/https?:\/\/[^\s<>"]+/i);
    if (urlMatch) {
      const domain = extractDomain(urlMatch[0]);
      if (domain) {
        let baseName = domain.replace(/\./g, '_');
        let uniqueName = baseName;
        let counter = 1;
        while (usedNames.has(uniqueName)) {
          uniqueName = `${baseName}_${counter}`;
          counter++;
        }
        usedNames.add(uniqueName);
        return uniqueName;
      }
    }
    return null;
  }
  
  // Function to extract existing name from a reference
  function extractExistingName(ref) {
    const nameMatch = ref.match(/name\s*=\s*(["']?)([^"'\s/>]+(?:\s+[^"'\s/>]+)*)\1/i);
    return nameMatch ? nameMatch[2] : null;
  }
  
  // Function to create a reference tag
  function createRefTag(name, content = null) {
    if (content) {
      return `<ref name="${name}">${content}</ref>`;
    } else {
      return `<ref name="${name}" />`;
    }
  }
  
  // Function to check if a reference is blacklisted
  function isBlacklisted(ref) {
    const name = extractExistingName(ref);
    return name && blacklist.includes(name);
  }
  
  // First pass: collect all references and used names
  content.replace(refRegex, (match) => {
    if (!isBlacklisted(match)) {
      const existingName = extractExistingName(match);
      if (existingName) {
        usedNames.add(existingName);
      }
      if (allRefs[match]) {
        allRefs[match].count++;
      } else {
        allRefs[match] = { count: 1, name: existingName, firstOccurrence: match };
      }
    }
    return match;
  });
  
  // Second pass: replace duplicates with named references
  let deduplicatedCount = 0;
  content = content.replace(refRegex, (match) => {
    if (isBlacklisted(match)) {
      return match; // Return blacklisted references unchanged
    }
    if (allRefs[match] && allRefs[match].count > 1) {
      if (!allRefs[match].name) {
        // This is a duplicate without a name
        const generatedName = generateUniqueName(match);
        if (generatedName && !blacklist.includes(generatedName)) {
          allRefs[match].name = generatedName;
          allRefs[match].firstOccurrence = createRefTag(generatedName, match.match(/<ref[^>]*>([\s\S]*)<\/ref>/)[1]);
          return allRefs[match].firstOccurrence;
        }
      } else {
        // This is a named reference
        if (match === allRefs[match].firstOccurrence) {
          // This is the first occurrence, keep it as is
          return match;
        } else {
          // This is a subsequent occurrence, replace with short form
          deduplicatedCount++;
          return createRefTag(allRefs[match].name);
        }
      }
    }
    return match;  // Return unchanged for non-duplicates or blacklisted references
  });
  
  // Update the textarea with the deduplicated content
  if (deduplicatedCount > 0) {
    editTextarea.value = content;
    
    // Add edit summary
    let currentSummary = summaryInput.value;
    let deduplicationSummary = `Deduplicated ${deduplicatedCount} reference${deduplicatedCount > 1 ? 's' : ''}`;
    summaryInput.value = currentSummary ? `${currentSummary} • ${deduplicationSummary}` : deduplicationSummary;
    document.editform.wpMinoredit.checked = true;
  }
}

// Function to check if the edit textarea is ready
function isEditTextareaReady() {
  const editTextarea = document.getElementById('wpTextbox1');
  const summaryInput = document.getElementById('wpSummary');
  return editTextarea && editTextarea.value && summaryInput;
}

// Function to run deduplication when everything is ready
function runDeduplicationWhenReady() {
  if (isEditTextareaReady()) {
    deduplicateReferences();
  } else {
    // If not ready, check again after a short delay
    setTimeout(runDeduplicationWhenReady, 100);
  }
}

// Run the deduplication when the edit page is fully loaded
if (mw.config.get('wgAction') === 'edit') {
  if (document.readyState === 'complete') {
    runDeduplicationWhenReady();
  } else {
    window.addEventListener('load', runDeduplicationWhenReady);
  }
}


// </nowiki>