Wikipedia:Database reports/Largely duplicative file names/Configuration

This report is updated every day.

Source code

edit
// released under public domain; MZMcBride, Tim Landscheidt, Deadbeef; 2011, 2013, 2023

use anyhow::Result;
use dbreps2::{Frequency, Report};
use mysql_async::prelude::Queryable;
use mysql_async::Conn;

/// Marker type for the "Largely duplicative file names" database report.
///
/// It carries no state; all behaviour lives in its `Report<Row>` implementation.
#[derive(Debug, Clone, Copy, Default)]
pub struct DupeFileNames;

/// One group of file pages whose titles are identical once lower-cased.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Row {
    /// Lower-cased page title shared by every file in the group.
    norm_name: String,
    /// Number of file pages in the group.
    count: usize,
    /// Original page titles of the group, joined with `|`.
    orig_names_str: String,
}

impl Report<Row> for DupeFileNames {
    /// Title of the report.
    fn title(&self) -> &'static str {
        "Largely duplicative file names"
    }

    /// Introductory text for the report.
    fn intro(&self) -> &'static str {
        "Largely duplicative file names (limited to the first 1000 entries)"
    }

    /// Column headings, in the same order as the cells built by `format_row`.
    fn headings(&self) -> Vec<&'static str> {
        ["Normalized name", "Count", "Real names"].to_vec()
    }

    /// The report's frequency: daily.
    fn frequency(&self) -> Frequency {
        Frequency::Daily
    }

    /// SQL that groups non-redirect pages in namespace 6 by lower-cased title
    /// and keeps the groups with more than one member, capped at 1000 groups.
    ///
    /// The selected columns are, in order: the lower-cased title, the original
    /// titles joined with `|`, and the group size.
    ///
    /// NOTE(review): MySQL limits `GROUP_CONCAT` output via
    /// `group_concat_max_len`; a very large group could presumably come back
    /// truncated — confirm against the server configuration.
    fn query(&self) -> &'static str {
        "
        /* dupefilenames.py SLOW_OK */
        SELECT
          LOWER(CONVERT(page_title USING utf8mb4)),
          GROUP_CONCAT(CONVERT(page_title USING utf8mb4) SEPARATOR '|'),
          COUNT(*)
        FROM page
        WHERE page_namespace = 6
        AND page_is_redirect = 0
        GROUP BY 1
        HAVING COUNT(*) > 1
        LIMIT 1000;
        "
    }

    /// Runs `query()` on `conn` and maps every result tuple to a [`Row`].
    ///
    /// # Errors
    ///
    /// Propagates any error returned by the database call.
    async fn run_query(&self, conn: &mut Conn) -> Result<Vec<Row>> {
        // The tuple order mirrors the SELECT list: name, joined titles, count.
        let rows = conn
            .query_map(self.query(), |(lowered, joined_titles, total)| Row {
                norm_name: lowered,
                count: total,
                orig_names_str: joined_titles,
            })
            .await?;
        Ok(rows)
    }

    /// Renders one row as three cells: the normalized name, the count, and a
    /// comma-separated list of `[[:File:…|…]]` links, one per original title.
    fn format_row(&self, row: &Row) -> Vec<String> {
        // The original titles arrive joined with '|'; split them back apart.
        let mut links = Vec::new();
        for title in row.orig_names_str.split('|') {
            links.push(format!("[[:File:{title}|{title}]]"));
        }

        vec![
            row.norm_name.clone(),
            row.count.to_string(),
            links.join(", "),
        ]
    }

    /// Source text of `dupefilenames.rs`, embedded at compile time.
    fn code(&self) -> &'static str {
        include_str!("dupefilenames.rs")
    }
}