# MediaProcessing/apps/pyMetadata/algo/PrefixMatcher.py

import re
from typing import List, Optional
from fuzzywuzzy import fuzz, process
from .AlgorithmBase import AlgorithmBase, MatchResult
from clazz.Metadata import Metadata


class PrefixMatcher(AlgorithmBase):
    """Pick the metadata entry whose title prefix best matches a search title.

    Every title in ``self.titles`` is compared against the title and the
    alternative titles of every entry in ``self.metadata`` (both attributes
    are presumably populated by ``AlgorithmBase`` -- confirm against the base
    class). Ties on score are broken by source priority.
    """

    # Minimum fuzzy score a candidate must reach to be accepted as a match.
    MIN_SCORE = 70

    # Matches every character that is not an ASCII letter, digit or whitespace.
    _NON_ALNUM = re.compile(r'[^a-zA-Z0-9\s]')

    # Lower value = preferred source when scores tie.
    _SOURCE_PRIORITY = {'mal': 1, 'anii': 2, 'imdb': 3}
    # Rank given to any source missing from _SOURCE_PRIORITY (sorted last).
    _DEFAULT_PRIORITY = 4

    def preprocess_text(self, text: str) -> str:
        """Return *text* normalised for comparison.

        Characters other than ASCII letters, digits and whitespace are
        replaced by spaces; the result is stripped and lower-cased.
        """
        return self._NON_ALNUM.sub(' ', text).strip().lower()

    def source_priority(self, source: str) -> int:
        """Return the tie-break rank of *source* (lower is preferred)."""
        return self._SOURCE_PRIORITY.get(source, self._DEFAULT_PRIORITY)

    def _prefix(self, text: str) -> str:
        """Return the prefix of the normalised *text* used for scoring."""
        # NOTE(review): ``[:1]`` keeps only the first *character* of the
        # normalised string, so the score only reflects whether the first
        # characters agree. Confirm this is intended (rather than, e.g.,
        # the first word) before changing it.
        return self.preprocess_text(text)[:1]

    def getBestMatch(self) -> Optional[Metadata]:
        """Return the best-matching metadata, or ``None`` if nothing qualifies.

        All comparisons are recorded as ``MatchResult`` entries, sorted by
        descending score and then by source priority, and handed to
        ``print_match_summary``. The top entry is returned only when its
        score reaches ``MIN_SCORE``; previously the top entry was returned
        unconditionally, which bypassed the threshold.
        """
        match_results: List[MatchResult] = []

        for title in self.titles:
            title_prefix = self._prefix(title)

            for metadata in self.metadata:
                # Main title first, then alternative titles, so that the
                # stable sort below keeps that order among equal keys.
                # ``or ()`` tolerates a missing (None) altTitle.
                for candidate in (metadata.title, *(metadata.altTitle or ())):
                    score = fuzz.token_sort_ratio(title_prefix, self._prefix(candidate))
                    match_results.append(
                        MatchResult(title, candidate, score, metadata.source, metadata)
                    )

        # Highest score first; equal scores are ordered by source priority.
        match_results.sort(key=lambda r: (-r.score, self.source_priority(r.source)))

        # Print match summary
        self.print_match_summary(match_results)

        if not match_results:
            return None

        top_result = match_results[0]
        # Enforce the acceptance threshold on the result actually returned.
        return top_result.data if top_result.score >= self.MIN_SCORE else None