MediaProcessing/apps/pyMetadata/sources/source.py
2026-01-02 01:09:26 +01:00

86 lines
3.4 KiB
Python

import logging, re
from abc import ABC, abstractmethod
from typing import List, Tuple
from fuzzywuzzy import fuzz
from models.metadata import Metadata
import asyncio
# Module-level logger, named after this module; used by SourceBase's helpers.
log = logging.getLogger(__name__)
class SourceBase(ABC):
    """Abstract base class for title-based metadata sources.

    Subclasses implement :meth:`search` and :meth:`queryIds`. The remaining
    methods are shared helpers that fuzzy-match candidate titles against the
    titles being looked up.
    """

    # Class-level default; ``__init__`` always rebinds it on the instance.
    titles: List[str] = []

    def __init__(self, titles: List[str]) -> None:
        """Store the titles this source should look up."""
        self.titles = titles

    @abstractmethod
    async def search(self) -> Metadata | None:
        """Return the metadata produced by the subclass, or ``None``."""

    @abstractmethod
    async def queryIds(self, title: str) -> dict[str, str]:
        """Return a ``str -> str`` mapping for ``title``.

        NOTE(review): presumably source id -> title, which is the shape the
        matching helpers below consume -- confirm against the subclasses.
        """

    def isMatchOrPartial(self, source: str | None, title: str | None, foundTitle: str | None) -> bool:
        """Tell whether ``foundTitle`` is an acceptable match for ``title``.

        Two ``fuzz.ratio`` scores are computed: one against the full
        ``foundTitle`` ("direct") and one against its leading part, i.e. the
        text before the first character that is not an ASCII letter, a digit
        or whitespace ("partial"). The outcome is logged at INFO level.

        Args:
            source: Label of the source, used only in the log message.
            title: The title being searched for.
            foundTitle: The candidate title returned by the source. ``None``
                is treated as a failed match instead of raising.

        Returns:
            ``True`` when the direct score is at least 60 or the partial
            score is at least 80, otherwise ``False``.
        """
        # re.split always yields at least one element, so [0] is safe.
        # NOTE(review): the character class is ASCII-only, so a non-ASCII
        # letter also terminates the leading part.
        clean_foundTitle = re.split(r'[^a-zA-Z0-9\s]', foundTitle or "")[0].strip()
        directMatch = fuzz.ratio(title, foundTitle)
        partialMatch = fuzz.ratio(title, clean_foundTitle) if clean_foundTitle else 0

        if directMatch >= 60:
            log.info(f"{source} -> Direct Match for '{title}' of '{foundTitle}' on part '{clean_foundTitle}' with direct score: {directMatch} and partial {partialMatch}")
            return True
        if partialMatch >= 80:
            log.info(f"{source} -> Partial Match for '{title}' of '{foundTitle}' on part '{clean_foundTitle}' with direct score: {directMatch} and partial {partialMatch}")
            return True
        log.info(f"{source} -> Match failed for '{title}' of '{foundTitle}' on part '{clean_foundTitle}' with direct score: {directMatch} and partial {partialMatch}")
        return False

    def getMatchingOnTitleWords(self, idToTitle: dict[str, str], titles: List[str]) -> dict[str, Tuple[str, int]]:
        """Filter ``idToTitle`` down to entries that resemble one of ``titles``.

        An entry is kept when it shares at least one whitespace-separated
        word with a title and its ``fuzz.token_set_ratio`` against that title
        is at least 75.

        Args:
            idToTitle: Mapping of id to stored title.
            titles: Titles to compare every stored title against.

        Returns:
            Mapping of id to ``(stored_title, score)``. When an id matches
            several titles the highest score is kept. When nothing matches,
            every entry of ``idToTitle`` is returned with score 0.
        """
        matched_idToTitle: dict[str, Tuple[str, int]] = {}
        for title in titles:
            title_words = set(title.split())
            for entry_id, stored_title in idToTitle.items():
                # Cheap prefilter: only score candidates sharing a word.
                # The comparison is exact, i.e. case-sensitive.
                if title_words.isdisjoint(stored_title.split()):
                    continue
                score = fuzz.token_set_ratio(title, stored_title)
                if score < 75:
                    continue
                # Do not let a weaker match from a later title overwrite a
                # stronger one recorded for the same id.
                previous = matched_idToTitle.get(entry_id)
                if previous is None or score > previous[1]:
                    matched_idToTitle[entry_id] = (stored_title, score)

        if matched_idToTitle:
            return matched_idToTitle
        # No title matched: fall back to every original entry with score 0.
        return {entry_id: (stored_title, 0) for entry_id, stored_title in idToTitle.items()}

    def findBestMatchAcrossTitles(self, idToTitle: dict[str, str], titles: List[str]) -> Tuple[str, str]:
        """Pick the stored title with the highest ``fuzz.ratio`` to any title.

        Candidates are first narrowed with :meth:`getMatchingOnTitleWords`
        (which falls back to all entries when nothing matches).

        Args:
            idToTitle: Mapping of id to stored title.
            titles: Titles to compare against.

        Returns:
            ``(best_id, best_title)`` for the highest-scoring candidate; on
            ties the first one encountered wins. ``("", "")`` when there are
            no candidates or no candidate scores above 0.
        """
        # Filtered (or, as fallback, complete) candidates from word matching.
        filtered_idToTitle = self.getMatchingOnTitleWords(idToTitle, titles)
        best_match_id = ""
        best_match_title = ""
        best_ratio = 0
        for title in titles:
            for entry_id, (stored_title, _score) in filtered_idToTitle.items():
                ratio = fuzz.ratio(title, stored_title)
                if ratio > best_ratio:
                    best_ratio = ratio
                    best_match_id = entry_id
                    # Return the title text itself, not the (title, score)
                    # pair, so the result matches the declared return type.
                    best_match_title = stored_title
        return best_match_id, best_match_title

    def logNoMatch(self, source: str, titles: List[str]) -> None:
        """Log at INFO level that ``source`` produced no match for ``titles``."""
        combined_titles = ", ".join(titles)
        log.info(f"No match in source {source} for titles: {combined_titles}")