MediaProcessing/apps/pyMetadata/algo/PrefixMatcher.py
2024-06-18 19:21:25 +02:00

55 lines
2.1 KiB
Python

import re
from typing import List, Optional
from fuzzywuzzy import fuzz, process
from .AlgorithmBase import AlgorithmBase, MatchResult
from clazz.Metadata import Metadata
class PrefixMatcher(AlgorithmBase):
    """Select a metadata entry by fuzzy-comparing normalised title prefixes.

    Every title in ``self.titles`` is compared with the main title and each
    alternative title of every entry in ``self.metadata``. Only the first
    ``PREFIX_LENGTH`` characters of the normalised strings are compared.
    """

    # Number of leading characters (after normalisation) that are compared.
    PREFIX_LENGTH = 1
    # Minimum score the top-ranked candidate must reach to be returned.
    MIN_SCORE = 70

    # Matches everything except ASCII letters, ASCII digits and whitespace.
    _NON_ALNUM = re.compile(r'[^a-zA-Z0-9\s]')
    # Lower number = preferred source when two candidates score the same.
    _SOURCE_PRIORITY = {'mal': 1, 'anii': 2, 'imdb': 3}
    # Rank given to any source that is not listed in _SOURCE_PRIORITY.
    _DEFAULT_PRIORITY = 4

    def preprocess_text(self, text: str) -> str:
        """Return *text* normalised for comparison.

        Characters other than ASCII letters, digits and whitespace are
        replaced by a space; the result is stripped and lower-cased.
        """
        return self._NON_ALNUM.sub(' ', text).strip().lower()

    def source_priority(self, source: str) -> int:
        """Return the tie-break rank of *source* (smaller is preferred)."""
        return self._SOURCE_PRIORITY.get(source, self._DEFAULT_PRIORITY)

    def _prefix(self, text: str) -> str:
        """Return the leading ``PREFIX_LENGTH`` characters of normalised *text*."""
        return self.preprocess_text(text)[:self.PREFIX_LENGTH]

    def getBestMatch(self) -> Optional[Metadata]:
        """Return the best-matching metadata entry, or ``None``.

        All (title, candidate title) pairs are scored with
        ``fuzz.token_sort_ratio`` on their prefixes, ranked by descending
        score and then by source priority, and passed to
        ``print_match_summary``.

        Returns:
            The metadata of the top-ranked pair when its score is at least
            ``MIN_SCORE``; otherwise ``None`` (also when there is nothing
            to compare).
        """
        match_results: List[MatchResult] = []

        for title in self.titles:
            title_prefix = self._prefix(title)
            for metadata in self.metadata:
                # Main title first, then the alternatives: the sort below is
                # stable, so this order decides ties within the same source.
                for candidate in (metadata.title, *metadata.altTitle):
                    score = fuzz.token_sort_ratio(title_prefix, self._prefix(candidate))
                    match_results.append(
                        MatchResult(title, candidate, score, metadata.source, metadata)
                    )

        match_results.sort(key=lambda r: (-r.score, self.source_priority(r.source)))

        # Print match summary
        self.print_match_summary(match_results)

        # The threshold is checked on the top-ranked result so that a weak
        # candidate is never returned merely because it sorted first.
        if match_results and match_results[0].score >= self.MIN_SCORE:
            return match_results[0].data
        return None