import re

from fuzzywuzzy import fuzz

from .AlgorithmBase import AlgorithmBase, MatchResult
from clazz.Metadata import Metadata


class AdvancedMatcher(AlgorithmBase):
    """Fuzzy title matcher that also compares titles with their subtitle removed.

    Every search title in ``self.titles`` is scored against every candidate in
    ``self.metadata`` (both are expected to be provided on the instance,
    presumably by ``AlgorithmBase`` -- confirm against the base class). The
    score of a pair is the best of three ``fuzz.token_sort_ratio`` comparisons:
    full title vs. full title, cleaned title vs. cleaned title, and cleaned
    title vs. each cleaned alternative title of the candidate.
    """

    # Minimum combined score a candidate needs in order to be returned.
    MIN_MATCH_SCORE = 70

    # Matches from the first colon, hyphen or em dash to the end of the line.
    _SUBTITLE_TAIL = re.compile(r'[:\-\—].*')

    def clean_title(self, title: str) -> str:
        """Return *title* without the text that follows its first separator.

        Everything from the first colon, hyphen or em dash onwards is dropped
        and surrounding whitespace is stripped.

        Args:
            title: The raw title.

        Returns:
            The shortened title. If removing the tail leaves nothing (the title
            starts with a separator), the whitespace-stripped original is
            returned instead, so that unrelated titles do not all collapse to
            the same empty string.
        """
        shortened = self._SUBTITLE_TAIL.sub('', title).strip()
        return shortened or title.strip()

    def getBestMatch(self) -> Metadata | None:
        """Return the best-scoring metadata entry, or ``None`` if none qualifies.

        All scored pairs are collected as ``MatchResult`` objects and handed to
        ``self.print_match_summary`` before returning.

        Returns:
            The metadata object with the highest combined score over all
            (title, metadata) pairs, provided that score is at least
            ``MIN_MATCH_SCORE``; otherwise ``None``. On equal scores the
            first pair encountered wins.
        """
        # The candidate-side strings do not depend on the search title, so
        # normalise them once instead of once per search title.
        # NOTE(review): ``altTitle`` is assumed to be an iterable of strings;
        # a missing (None) value is treated as "no alternative titles".
        candidates = [
            (
                metadata,
                metadata.title.lower(),
                self.clean_title(metadata.title).lower(),
                [self.clean_title(alt).lower() for alt in (metadata.altTitle or ())],
            )
            for metadata in self.metadata
        ]

        best_match = None
        best_score = -1
        match_results = []

        for title in self.titles:
            lowered_title = title.lower()
            lowered_clean_title = self.clean_title(title).lower()

            for metadata, meta_title, meta_clean_title, alt_titles in candidates:
                full_ratio = fuzz.token_sort_ratio(lowered_title, meta_title)
                clean_ratio = fuzz.token_sort_ratio(lowered_clean_title, meta_clean_title)
                alt_ratio = max(
                    (fuzz.token_sort_ratio(lowered_clean_title, alt) for alt in alt_titles),
                    default=0,
                )

                # A pair is as good as its strongest comparison.
                combined_score = max(full_ratio, clean_ratio, alt_ratio)
                match_results.append(
                    MatchResult(title, metadata.title, combined_score, metadata.source, metadata)
                )

                # Strictly greater: the first pair reaching a score keeps it.
                # A new top score below the threshold resets the match to None,
                # so the result always reflects the overall top score.
                if combined_score > best_score:
                    best_score = combined_score
                    best_match = metadata if combined_score >= self.MIN_MATCH_SCORE else None

        self.print_match_summary(match_results)
        return best_match