|
|
| Line 1: |
Line 1: |
| <syntaxhighlight lang="python" line="">
| | Ask hkm for the program if you're interested in helping out with the project! |
| """
| |
| Wiki Rating Processor
| |
| | |
| This script processes community ratings from a talk page and merges them with main list entries
| |
| from a xenharmonic wiki. Results are formatted into a sorted wikitable and copied to clipboard.
| |
| | |
| Disclaimer: much of this code is AI-generated, because I don't have enough time to write it all by hand.
| |
| """
| |
| | |
| import re
| |
| import requests
| |
| from bs4 import BeautifulSoup
| |
| import pyperclip
| |
| | |
# Regular expressions precompiled for better performance

# Internal wiki link, [[target]] or [[target|display]]:
# group 1 is the target, group 2 the optional display text.
WIKI_LINK_PATTERN = re.compile(r'\[\[(.*?)(?:\|(.*?))?\]\]')
# External link, [http(s)://url display]: group 1 is the URL, group 2 the
# display text. The display group is greedy, so it runs up to the last "]"
# available and may itself contain brackets.
EXTERNAL_LINK_PATTERN = re.compile(r'\[(https?://\S+)\s+(.*)\]')
# Talk page rating line, "<author>: <description>. <rating>":
# group 1 is everything before the first colon, group 2 the description
# (the final period is not captured), group 3 a run of digits and dots.
TALK_PAGE_PATTERN = re.compile(r'^([^:]+):\s+(.*)\.\s+([0-9.]+)$')

# Constants for input validation

# Talk page lines whose author or description exceed these lengths are skipped.
MAX_AUTHOR_LENGTH = 40
MAX_DESCRIPTION_LENGTH = 100
# Inclusive (lowest, highest) bounds for an accepted rating.
VALID_RATING_RANGE = (0, 5)
# Opening markup, caption and column headings of the generated wikitable.
# The backslash joins the two caption lines into a single line of output.
WIKITABLE_HEADER = '''{| class="wikitable sortable" style="margin: auto; max-width: 800px; width: 100%;"
|+Xenharmonic works sorted by community ranking. The "R" column is the rating of the work, \
and the "#" column is the number of ratings given to that work.
! Creator !! Work !! Tuning !! Notes !! R !! #

'''

# Field values used for a rated work that has no row on the main page yet.
DEFAULT_ENTRY = {
    'tuning': '',
    'notes': '',
    'popularity': '0',
    'link': ''
}
| |
| | |
def format_link(text: str) -> str:
    """Apply special formatting to each link in the string individually.

    Existing formatting is first stripped with ``remove_formatting``. Then:

    * internal links ``[[target]]`` / ``[[target|display]]`` are wrapped in
      ``<u>...</u>``;
    * single-bracket spans are treated as external links and rewritten by
      the first whitespace-free token (the URL):
      - a URL containing "bandcamp" is emitted unchanged,
      - a URL containing "youtube" is wrapped in ``''...''`` (italics),
      - any other URL gets its label wrapped in ``<nowiki>[...]</nowiki>``.

    A bracketed span without a label (for example ``[https://example.com]``)
    is returned unchanged instead of raising ``IndexError``, and the URL
    keeps its original letter case in the output.

    Args:
        text: Wikitext of a single table cell.

    Returns:
        The text with every link reformatted.
    """
    text = remove_formatting(text)

    def replace_internal(match):
        # Split "target|display" once; `sep` is empty when there is no pipe.
        target, sep, display = match.group(1).partition('|')
        target = target.strip()
        if not sep:
            return f'<u>[[{target}]]</u>'
        return f'<u>[[{target}|{display.strip()}]]</u>'

    processed_text = re.sub(r'\[\[(.*?)\]\]', replace_internal, text, flags=re.DOTALL)

    def replace_external(match):
        content = match.group(1)
        url, sep, label = content.partition(' ')
        url = url.strip()
        label = label.strip()
        # Lower-case only for the domain checks; the emitted URL keeps its
        # case because URL paths can be case-sensitive.
        url_lowered = url.lower()

        if 'bandcamp' in url_lowered:
            return f'[{content}]'
        if 'youtube' in url_lowered:
            reconstructed = url + (f' {label}' if sep else '')
            return f"''[{reconstructed}]''"
        if not label:
            # Nothing to wrap in <nowiki>; leave the bracketed span alone.
            return f'[{content}]'
        return f'[{url} <nowiki>[{label}]</nowiki>]'

    # The lookarounds skip the double brackets of internal links.
    processed_text = re.sub(r'(?<!\[)\[(?!\[)(.*?)\]', replace_external,
                            processed_text, flags=re.DOTALL)

    return processed_text
| |
| | |
def process_wiki_links(text: str) -> str:
    """Collapse internal and external links down to their visible text."""

    def internal_text(match):
        # [[target|display]] -> display, [[target]] -> target
        return match.group(2) or match.group(1)

    def external_text(match):
        # [url label] -> label (trimmed); an empty label yields ''
        label = match.group(2)
        return label.strip() if label else ''

    without_internal = WIKI_LINK_PATTERN.sub(internal_text, text)
    return EXTERNAL_LINK_PATTERN.sub(external_text, without_internal)
| |
| | |
def remove_formatting(text: str) -> str:
    """Remove wiki formatting, HTML tags, and paired <nowiki> tags.

    Steps, in order:

    1. ``<nowiki>[label]</nowiki>`` is replaced by ``label`` (the escaped
       brackets are dropped together with the tags).
    2. Any remaining HTML-style tag is removed. Tags with attributes,
       upper-case names, digits in the name, or a self-closing slash
       (``<span style="...">``, ``<BR/>``, ``<h2>``) are covered, so an
       element is never left with only its closing tag stripped.
    3. Bold (``'''x'''``), italic (``''x''``) and ``__x__`` markers are
       removed, keeping the enclosed text.

    Args:
        text: Raw wikitext.

    Returns:
        The text without formatting, with surrounding whitespace trimmed.
    """
    text = re.sub(r'<nowiki>\[(.*?)\]</nowiki>', r'\1', text, flags=re.DOTALL)
    # A tag name must start with a letter, so "1 < 2 > 0" is left untouched.
    text = re.sub(r'</?[A-Za-z][A-Za-z0-9]*(?:\s[^<>]*)?/?>', '', text)
    # Bold is handled before italic so ''' is not consumed as '' plus '.
    text = re.sub(r"'''(.*?)'''", r'\1', text)
    text = re.sub(r"''(.*?)''", r'\1', text)
    text = re.sub(r"__(.*?)__", r'\1', text)
    return text.strip()
| |
| | |
| """TALK"""
| |
| | |
def process_talk_page(url: str, timeout: float = 30.0) -> dict:
    """Process talk page entries to calculate community ratings.

    Downloads *url*, reads the ``wpTextbox1`` textarea and feeds every line
    that matches ``TALK_PAGE_PATTERN`` (and respects the author/description
    length limits) to ``process_rating``.

    Args:
        url: Address of the talk page's edit view.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        A dict from (author, work) to its list of ratings. It is empty when
        the request fails or the textarea cannot be found; both cases print
        a message.
    """
    ratings = {}
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        # Timeouts are RequestException subclasses and are reported here too.
        print(f"Network error: {e}")
        return ratings

    soup = BeautifulSoup(response.text, 'html.parser')
    textarea = soup.find('textarea', {'id': 'wpTextbox1'})
    if textarea is None:
        print("Error: Could not find talk page content")
        return {}

    for line in textarea.text.split('\n'):
        match = TALK_PAGE_PATTERN.match(line.strip())
        if not match:
            continue
        author, description, rating_str = match.groups()
        # Over-long fields are skipped rather than recorded.
        if (len(author) <= MAX_AUTHOR_LENGTH
                and len(description) <= MAX_DESCRIPTION_LENGTH):
            process_rating(author, description, rating_str, ratings)

    return ratings
| |
| | |
def process_rating(author: str, desc: str, rating_str: str, ratings: dict):
    """Validate one rating and, if acceptable, record it in *ratings*.

    The rating is appended to the list stored under ``(author, desc)``.
    Text that is not a number, or a number outside ``VALID_RATING_RANGE``,
    is ignored without any message.
    """
    try:
        value = float(rating_str)
    except ValueError:
        # Not a number (e.g. "1.2.3"): drop the entry quietly.
        return

    lowest, highest = VALID_RATING_RANGE
    if not (lowest <= value <= highest):
        return

    ratings.setdefault((author, desc), []).append(value)
| |
| | |
def process_main_page(url: str, timeout: float = 30.0) -> tuple:
    """Process main list page entries.

    Downloads *url*, reads the ``wpTextbox1`` textarea, splits its text
    around the wikitable and parses the table rows.

    Args:
        url: Address of the main page's edit view.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        A tuple ``(entries, before, after)``: the parsed table entries, the
        text preceding the table, and the text following it after passing
        through ``update_last_modified``. On a request error, a
        ``ValueError`` or a missing textarea, a message is printed and
        ``({}, '', '')`` is returned.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        textarea = soup.find('textarea', id='wpTextbox1')
        if textarea is None:
            print("Error: Could not find main page content")
            return {}, '', ''

        raw_text = textarea.get_text()
        before, table_section, after = parse_table_sections(raw_text)
        return process_table_entries(table_section), before, update_last_modified(after)

    except (requests.RequestException, ValueError) as e:
        print(f"Error: {e}")
        return {}, '', ''
| |
| | |
def parse_table_sections(text: str) -> tuple:
    """Cut *text* into (before table, table body, after table).

    The table is taken to start at the first ``{|`` and to end at the first
    ``|}`` after it; the body is returned with surrounding whitespace
    stripped. If either marker is missing, an error is printed and
    ``("", "", text)`` is returned.
    """
    try:
        leading, remainder = text.split('{|', 1)
        table_body, trailing = remainder.split('|}', 1)
    except ValueError as e:
        # Unpacking fails when a marker is absent from the text.
        print(f"Table parsing error: {e}")
        return "", "", text
    return leading, table_body.strip(), trailing
| |
| | |
def update_last_modified(content: str) -> str:
    """Update last modified timestamp.

    Args:
        content: Page text that follows the wikitable.

    Returns:
        *content* prefixed with a fresh ``Last updated by ~~~~.`` line. When
        *content* (ignoring leading whitespace) already begins with a
        "Last updated by" line, that line is replaced instead of a second
        one being added. An old line without a trailing newline is handled
        as well.
    """
    stamp = "\nLast updated by ~~~~.\n"
    stripped = content.lstrip()
    if stripped.startswith("Last updated by"):
        # Drop the old stamp line; `rest` is '' if nothing follows it.
        _, _, rest = stripped.partition('\n')
        return stamp + rest
    return stamp + content
| |
| | |
def process_table_entries(table_section: str) -> dict:
    """Turn the wikitable body into a dict of entries.

    Every newline-plus-pipe is rewritten to ``||`` so that each row sits on
    one line, the text is split at the ``||-`` row separators, and each
    chunk that starts with a pipe is passed to ``process_single_entry``.
    """
    entries = {}
    flattened = table_section.replace('\n|', '||')
    for row in re.split(r'\|\|-\s*', flattened):
        if not row.strip().startswith('|'):
            # Table attributes, caption and header land here.
            continue
        process_single_entry(row, entries)
    return entries
| |
| | |
def process_single_entry(entry: str, entries: dict):
    """Parse one flattened table row and merge it into *entries*.

    Rows with fewer than six cells, or whose last cell is ``}``, are
    ignored. The entry is keyed by the author cell and by the work cell
    reduced to plain text (links resolved, formatting removed).
    """
    cells = [cell.strip() for cell in re.split(r'\s*\|\|\s*', entry.lstrip('|'))]
    if len(cells) < 6 or cells[-1] == "}":
        return

    author, work_link, tuning, notes = cells[:4]
    entry_data = {
        'tuning': tuning,
        'notes': notes,
        # The sixth cell is the "#" column; the fifth (rating) is not kept.
        'popularity': cells[5],
        'link': work_link,
    }
    work_title = remove_formatting(process_wiki_links(work_link))
    update_entries(author, work_title, entry_data, entries)
| |
| | |
def update_entries(author: str, work: str, new_entry: dict, entries: dict):
    """Insert *new_entry*, resolving clashes with an existing key.

    Keys are compared case-insensitively. If no stored key clashes, the
    entry is added under ``(author, work)``. If one does, the stored entry
    is replaced only when ``should_replace`` approves; otherwise the new
    entry is discarded.
    """
    wanted = (author.lower(), work.lower())

    clash = None
    for key in entries:
        if (key[0].lower(), key[1].lower()) == wanted:
            clash = key
            break

    if clash is None:
        entries[(author, work)] = new_entry
        return

    if should_replace(clash, new_entry, entries):
        # Remove the old spelling of the key before storing the new one.
        del entries[clash]
        entries[(author, work)] = new_entry
| |
| | |
def should_replace(existing_key: tuple, new_entry: dict, entries: dict) -> bool:
    """Tell whether *new_entry* should displace the stored entry.

    Returns True when the new entry's ``popularity`` is strictly greater
    than the stored one, and also when either value is missing or is not
    an integer.
    """
    try:
        stored_count = int(entries[existing_key]['popularity'])
        incoming_count = int(new_entry['popularity'])
    except (KeyError, ValueError):
        # Values that cannot be compared default to replacing.
        return True
    return incoming_count > stored_count
| |
| | |
def build_output(before: str, after: str, averages: dict, main_data: dict) -> str:
    """Build final output using rating lists directly.

    Args:
        before: Page text that precedes the table.
        after: Page text that follows the table.
        averages: Dict from (author, work) to that work's list of ratings.
        main_data: Dict from (author, work) to the entry parsed from the
            main page.

    Returns:
        ``before``, the table header, one row per rated work, one row per
        unrated main page entry, the table terminator and ``after``, joined
        into one string.

    Rated rows are ordered by a smoothed score, ``(sum + 5) / (count + 2)``,
    highest first; the "R" cell still shows the plain mean. Unrated rows
    follow in the order they appear in *main_data*, so the output is the
    same on every run.
    """
    output = [before, WIKITABLE_HEADER]

    def sort_score(item):
        # Negated so that an ascending sort yields the best score first.
        ratings = item[1]
        return -(sum(ratings) + 5) / (len(ratings) + 2)

    for (author, work), ratings in sorted(averages.items(), key=sort_score):
        count = len(ratings)
        # An empty rating list has no mean; leave the "R" cell blank then.
        avg = sum(ratings) / count if count else None
        # Rated works missing from the main page get a placeholder entry.
        entry = main_data.get((author, work), {**DEFAULT_ENTRY, 'link': work})
        output.append(format_row(author, entry, avg, count))

    # Add unrated entries
    for key, entry in main_data.items():
        if key not in averages:
            output.append(format_row(key[0], entry))

    return ''.join(output + ["|}\n", after])
| |
| | |
def format_row(author: str, entry: dict, rating: float = None, count: int = None) -> str:
    """Render one wikitable row for *entry*.

    The rating is shown with two decimals and the count as-is; either cell
    is left empty when its value is None. The link cell is passed through
    ``format_link``.
    """
    rating_str = "" if rating is None else f"{rating:.2f}"
    count_str = "" if count is None else str(count)
    cells = (
        author,
        format_link(entry['link']),
        entry['tuning'],
        entry['notes'],
        rating_str,
        count_str,
    )
    return "|-\n| " + " || ".join(str(cell) for cell in cells) + "\n"
| |
| | |
def main():
    """Main processing workflow.

    Fetches the ratings talk page and the main list page and, when the main
    page yielded any entries, copies the rebuilt page text to the clipboard.
    Afterwards prints three sample strings together with their
    ``format_link`` output.
    """
    talk_url = "https://en.xen.wiki/index.php?title=User_talk:Hkm/Rankings&action=edit"
    main_url = "https://en.xen.wiki/index.php?title=List_of_xenharmonic_music_by_community_ratings&action=edit"

    # Despite the name, this maps (author, work) to the raw list of ratings.
    averages = process_talk_page(talk_url)
    main_data, before, after = process_main_page(main_url)

    # Nothing is copied when no main page entries were parsed.
    if main_data:
        pyperclip.copy(build_output(before, after, averages, main_data))
        print("Results copied to clipboard")

    # NOTE(review): indentation was lost in the copy of the script this was
    # restored from; the sample format_link calls below are assumed to
    # belong to main() rather than to module level. Confirm.
    n = "Namoic: [https://benyamind.bandcamp.com/track/chromacro-17-edo <nowiki>[Chromacro]</nowiki>]"
    print(n + '\n' + format_link(n) + '\n')

    n = "[https://soundcloud.com/jollybard/pop-song <nowiki>[pop song]</nowiki>]"
    print(n + '\n' + format_link(n) + '\n')

    n = "Harmony Hacker: <u>[[Gleam]]</u>"
    print(n + '\n' + format_link(n) + '\n')

if __name__ == "__main__":
    main()
| |
| </syntaxhighlight>
| |