import getpass
import re
from urllib.parse import unquote, urlparse

import mwclient
import requests
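
# Scan a Wikipedia page for external links cited in {{cite web}} templates
# and <ref> tags, check whether each link still responds, and, for dead
# links that have a Wayback Machine snapshot, append |archive-url=,
# |archive-date= and |url-status=dead to the citation before saving.
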
def login_to_wikipedia(username, password):
    """Open an authenticated mwclient session against English Wikipedia."""
    site = mwclient.Site('en.wikipedia.org')
    site.login(username, password)
    return site


def update_reference_with_archive(content, url, archive_url):
    """Append archive parameters to the citation that contains the dead URL."""
    # The Wayback snapshot URL embeds a timestamp: web.archive.org/web/YYYYMMDD...
    match = re.search(r'web\.archive\.org/web/(\d{4})(\d{2})(\d{2})', archive_url)
    archive_date = '-'.join(match.groups()) if match else 'Unknown'
    # Insert the parameters just before the closing braces of the {{cite web}}
    # template whose |url= matches this link (simple, non-nested templates only).
    pattern = re.compile(r'(\{\{cite web[^{}]*?\|url=' + re.escape(url) + r'[^{}]*?)\}\}',
                         re.IGNORECASE)
    replacement = rf'\g<1>|archive-url={archive_url}|archive-date={archive_date}|url-status=dead}}}}'
    return pattern.sub(replacement, content, count=1)


def check_link_status(url):
    """Return 'Alive' if the URL answers a HEAD request with HTTP 200."""
    try:
        # Follow redirects so a moved-but-working page still counts as alive.
        response = requests.head(url, timeout=5, allow_redirects=True)
        return 'Alive' if response.status_code == 200 else 'Dead'
    except requests.RequestException:
        return 'Dead'


def check_archive(url):
    """Query the Wayback Machine availability API for the closest snapshot."""
    try:
        # The API answers with {'archived_snapshots': {'closest': {'url': ...}}},
        # or an empty 'archived_snapshots' object when nothing is archived.
        response = requests.get('https://archive.org/wayback/available',
                                params={'url': url}, timeout=5)
        data = response.json()
        if data.get('archived_snapshots'):
            return 'Archived', data['archived_snapshots']['closest']['url']
        return 'Not Archived', 'N/A'
    except (requests.RequestException, ValueError, KeyError):
        return 'Error Checking Archive', 'N/A'


def extract_page_name(input_text):
    """Accept either a page title or a full URL such as .../wiki/Page_Name."""
    if input_text.startswith('http'):
        # Take the last path segment and decode percent-escapes.
        return unquote(urlparse(input_text).path.split('/')[-1])
    return input_text


def main():
    username = input('Enter your Wikipedia username: ')
    password = getpass.getpass('Enter your Wikipedia password: ')
    site = login_to_wikipedia(username, password)
    input_text = input('Enter the Wikipedia page name or URL: ')
    page_name = extract_page_name(input_text)
    page = site.pages[page_name]
    page_content = page.text()
    # Collect candidate URLs: |url= values from {{cite web}} templates and
    # bare or [bracketed] external links inside <ref> tags.
    cite_web_urls = re.findall(r'\{\{cite web[^}]*?\|url=([^|}\s]+)', page_content, re.IGNORECASE)
    ref_urls = re.findall(r'<ref[^>/]*>\s*\[?(https?://[^\s\]<]+)', page_content, re.IGNORECASE)
    all_urls = set(cite_web_urls + ref_urls)
    updated_content = page_content
    for url in all_urls:
        if 'archive.org' in url:
            continue  # already points at an archive
        # Only dead links that have a Wayback snapshot get rewritten.
        if check_link_status(url) == 'Dead':
            archive_status, archived_page = check_archive(url)
            if archive_status == 'Archived':
                updated_content = update_reference_with_archive(updated_content, url, archived_page)
    # Save once, after the loop, and only when something actually changed.
    if updated_content != page_content:
        page.save(updated_content, summary='Updated dead links with archive URLs')
        print(f'Updated Wikipedia page: {page_name}')
    else:
        print(f'No dead links to update on: {page_name}')


if __name__ == '__main__':
    main()
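
# Note: automated editing on en.wikipedia.org is governed by the site's bot
# policy; it is safest to trial this script on a sandbox page or test wiki.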