From c2cd6f451ed340d3ed9aff1ce963ebc52a9c709c Mon Sep 17 00:00:00 2001 From: bskjon Date: Mon, 30 Dec 2024 04:14:28 +0100 Subject: [PATCH] Updated metadata --- apps/pyMetadata/DryRun.py | 50 ++++++++++++++----------- apps/pyMetadata/algo/AdvancedMatcher.py | 22 ++++++++--- apps/pyMetadata/app.py | 11 +++--- 3 files changed, 50 insertions(+), 33 deletions(-) diff --git a/apps/pyMetadata/DryRun.py b/apps/pyMetadata/DryRun.py index 4da5a88b..8abacd27 100644 --- a/apps/pyMetadata/DryRun.py +++ b/apps/pyMetadata/DryRun.py @@ -6,7 +6,7 @@ from typing import List, Optional import uuid import threading import json -import time +import time, asyncio from fuzzywuzzy import fuzz from algo.AdvancedMatcher import AdvancedMatcher @@ -15,6 +15,7 @@ from algo.PrefixMatcher import PrefixMatcher from clazz.Metadata import Metadata from clazz.shared import EventData, EventMetadata, MediaEvent +from app import MetadataEventHandler from sources.anii import Anii from sources.imdb import Imdb from sources.mal import Mal @@ -56,32 +57,37 @@ else: class DryRun(): - titles: List[str] = [] + searchTitles: List[str] = [] + title: str + sanitizedName: str - def __init__(self, titles: List[str]) -> None: - self.titles = titles + def __init__(self, title: str, sanitizedName: str, searchTitles: List[str]) -> None: + self.title = title + self.sanitizedName = sanitizedName + self.searchTitles = searchTitles def run(self) -> None: - combined_titles = ", ".join(self.titles) - logger.info("Searching for %s", combined_titles) - result: Metadata | None = self.__getMetadata(self.titles) - - message: str | None = None - if (result is None): - message = f"No result for {combined_titles}" - logger.info(message) - - message = MediaEvent( - metadata = EventMetadata( - referenceId="00000000-0000-0000-0000-000000000000", - eventId="XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX", - derivedFromEventId=None, - status= "Failed" if result is None else "Success", + evnet = MediaEvent( + metadata=EventMetadata( + derivedFromEventId="ccccccccc-cccc-cccc-cccc-cccccccccccc", + eventId="eeeeeeeee-eeee-eeee-eeee-eeeeeeeeeeee", + referenceId="rrrrrrrrr-rrrr-rrrr-rrrr-rrrrrrrrrrrr", + status="Success", + created="2024-12-28T16:19:31.917684523", + source="DryRun" ), - data=result + eventType="DryRun", + data=EventData( + title=self.title, + sanitizedName=self.sanitizedName, + searchTitles=self.searchTitles + ) ) - - logger.info(message) + + handler = MetadataEventHandler(evnet) + + asyncio.run(handler.run()) + def __getMetadata(self, titles: List[str]) -> Metadata | None: mal = Mal(titles=titles) diff --git a/apps/pyMetadata/algo/AdvancedMatcher.py b/apps/pyMetadata/algo/AdvancedMatcher.py index e33c2b0f..74cce98a 100644 --- a/apps/pyMetadata/algo/AdvancedMatcher.py +++ b/apps/pyMetadata/algo/AdvancedMatcher.py @@ -1,22 +1,32 @@ from fuzzywuzzy import fuzz +import re from .AlgorithmBase import AlgorithmBase, MatchResult from clazz.Metadata import Metadata class AdvancedMatcher(AlgorithmBase): + def clean_title(self, title: str) -> str: + # Fjerner eventuelle ekstra tekster etter kolon eller andre skilletegn + return re.sub(r'[:\-\—].*', '', title).strip() + def getBestMatch(self) -> Metadata | None: best_match = None best_score = -1 match_results = [] for title in self.titles: + cleaned_title = self.clean_title(title) # Renset tittel uten ekstra tekst for metadata in self.metadata: - # Compute different match ratios - title_ratio = fuzz.token_sort_ratio(title.lower(), metadata.title.lower()) - alt_title_ratios = [fuzz.token_sort_ratio(title.lower(), alt_title.lower()) for alt_title in metadata.altTitle] + cleaned_metadata_title = self.clean_title(metadata.title) # Renset metadata-tittel + + # Compute different match ratios for both the original and cleaned titles + original_title_ratio = fuzz.token_sort_ratio(title.lower(), metadata.title.lower()) + cleaned_title_ratio = fuzz.token_sort_ratio(cleaned_title.lower(), cleaned_metadata_title.lower()) + + alt_title_ratios = [fuzz.token_sort_ratio(cleaned_title.lower(), self.clean_title(alt_title).lower()) for alt_title in metadata.altTitle] max_alt_title_ratio = max(alt_title_ratios) if alt_title_ratios else 0 - # Combine ratios as desired - combined_score = max(title_ratio, max_alt_title_ratio) + # Combine ratios: take the best of original vs cleaned title, and alt title match + combined_score = max(original_title_ratio, cleaned_title_ratio, max_alt_title_ratio) match_results.append(MatchResult(title, metadata.title, combined_score, metadata.source, metadata)) @@ -28,4 +38,4 @@ class AdvancedMatcher(AlgorithmBase): # Print match summary self.print_match_summary(match_results) - return best_match + return best_match \ No newline at end of file diff --git a/apps/pyMetadata/app.py b/apps/pyMetadata/app.py index 4fbc8560..4075f589 100644 --- a/apps/pyMetadata/app.py +++ b/apps/pyMetadata/app.py @@ -250,14 +250,15 @@ class MetadataEventHandler: event: MediaEvent = self.mediaEvent - searchableTitles: List[str] = event.data.searchTitles - searchableTitles.extend([ + unique_titles = set(event.data.searchTitles) + unique_titles.update([ event.data.title, event.data.sanitizedName ]) + searchableTitles = list(unique_titles) joinedTitles = "\n".join(searchableTitles) - logger.info("Searching for: %s", joinedTitles) + logger.info("Searching for:\n%s", joinedTitles) # Kjør den asynkrone søkemetoden result: Metadata | None = await self.__getMetadata(searchableTitles) @@ -300,10 +301,10 @@ class MetadataEventHandler: logger.info("\nPrefix matcher") prefixSelector = PrefixMatcher(titles=titles, metadata=filtered_results).getBestMatch() - if simpleSelector is not None: - return simpleSelector if advancedSelector is not None: return advancedSelector + if simpleSelector is not None: + return simpleSelector if prefixSelector is not None: return prefixSelector return None