86 lines
3.4 KiB
Python
86 lines
3.4 KiB
Python
|
|
import logging, re
|
|
from abc import ABC, abstractmethod
|
|
from typing import List, Tuple
|
|
|
|
from fuzzywuzzy import fuzz
|
|
|
|
from models.metadata import Metadata
|
|
import asyncio
|
|
|
|
|
|
# Module-level logger, named after this module via __name__.
log = logging.getLogger(__name__)
|
|
|
|
class SourceBase(ABC):
    """Abstract base class for title-based sources.

    Subclasses must implement :meth:`search` and :meth:`queryIds`. The other
    methods are shared helpers for fuzzy title comparison and logging.
    """

    # Score thresholds used by the matching helpers. Kept as class attributes
    # so a subclass can tune them without overriding the methods.
    DIRECT_MATCH_THRESHOLD = 60   # fuzz.ratio against the full found title
    PARTIAL_MATCH_THRESHOLD = 80  # fuzz.ratio against the leading part only
    WORD_MATCH_THRESHOLD = 75     # fuzz.token_set_ratio in the word pre-filter

    # Class-level default; __init__ rebinds this attribute on every instance.
    titles: List[str] = []

    def __init__(self, titles: List[str]) -> None:
        """Store the titles this source instance works with.

        Args:
            titles: Titles kept on the instance as ``self.titles`` (the list
                is stored as given, not copied).
        """
        self.titles = titles

    @abstractmethod
    async def search(self) -> Metadata | None:
        """Abstract coroutine; subclasses provide the actual lookup.

        Returns:
            Per the annotation, a ``Metadata`` object or ``None``.
        """

    @abstractmethod
    async def queryIds(self, title: str) -> dict[str, str]:
        """Abstract coroutine; subclasses provide the actual query.

        Args:
            title: Title to query for.

        Returns:
            Per the annotation, a ``dict`` mapping ``str`` to ``str``.
            NOTE(review): presumably id -> title, matching the ``idToTitle``
            arguments of the helpers below; confirm against subclasses.
        """

    def isMatchOrPartial(self, source: str | None, title, foundTitle) -> bool:
        """Decide whether ``foundTitle`` is an acceptable match for ``title``.

        Two ``fuzz.ratio`` scores are computed: one against the whole
        ``foundTitle`` and one against only the text that precedes its first
        character that is not an ASCII letter, digit or whitespace. The
        outcome and both scores are logged at INFO level.

        Args:
            source: Label of the source; only used in the log line.
            title: Title that was searched for.
            foundTitle: Title to compare against. ``None`` never matches.

        Returns:
            ``True`` if the direct score reaches ``DIRECT_MATCH_THRESHOLD`` or
            the partial score reaches ``PARTIAL_MATCH_THRESHOLD``; otherwise
            ``False``.
        """
        if foundTitle is None:
            # Nothing to compare against; previously this crashed in re.split.
            clean_foundTitle = None
            directMatch = 0
            partialMatch = 0
        else:
            # re.split always returns at least one element, so [0] is safe.
            clean_foundTitle = re.split(r'[^a-zA-Z0-9\s]', foundTitle)[0].strip()
            directMatch = fuzz.ratio(title, foundTitle)
            partialMatch = fuzz.ratio(title, clean_foundTitle)

        if directMatch >= self.DIRECT_MATCH_THRESHOLD:
            verdict, matched = "Direct Match", True
        elif partialMatch >= self.PARTIAL_MATCH_THRESHOLD:
            verdict, matched = "Partial Match", True
        else:
            verdict, matched = "Match failed", False

        log.info(f"{source} -> {verdict} for '{title}' of '{foundTitle}' on part '{clean_foundTitle}' with direct score: {directMatch} and partial {partialMatch}")
        return matched

    def getMatchingOnTitleWords(self, idToTitle: dict[str, str], titles: List[str]) -> dict[str, Tuple[str, int]]:
        """Pre-filter candidates to those sharing a word with a search title.

        A candidate is kept when it has at least one whitespace-separated
        word in common with some entry of ``titles`` and its
        ``fuzz.token_set_ratio`` against that entry reaches
        ``WORD_MATCH_THRESHOLD``. If several titles qualify the same
        candidate, the highest score is kept.

        Args:
            idToTitle: Candidate titles keyed by id.
            titles: Search titles to compare the candidates against.

        Returns:
            A dict mapping each kept id to ``(stored_title, score)``. If no
            candidate qualifies, every entry of ``idToTitle`` is returned
            with score ``0``, so the result is empty only when ``idToTitle``
            is empty.
        """
        # Tokenise each stored title once rather than once per search title.
        stored_words = {
            entry_id: set(stored_title.split())
            for entry_id, stored_title in idToTitle.items()
        }

        matched: dict[str, Tuple[str, int]] = {}
        for title in titles:
            title_words = set(title.split())
            for entry_id, stored_title in idToTitle.items():
                # Skip candidates that do not share a single word.
                if title_words.isdisjoint(stored_words[entry_id]):
                    continue
                score = fuzz.token_set_ratio(title, stored_title)
                if score < self.WORD_MATCH_THRESHOLD:
                    continue
                previous = matched.get(entry_id)
                # Keep the best score; a later, weaker title must not
                # overwrite an earlier, stronger one.
                if previous is None or score > previous[1]:
                    matched[entry_id] = (stored_title, score)

        if matched:
            return matched

        # No title matched: fall back to every candidate with score 0.
        return {
            entry_id: (stored_title, 0)
            for entry_id, stored_title in idToTitle.items()
        }

    def findBestMatchAcrossTitles(self, idToTitle: dict[str, str], titles: List[str]) -> Tuple[str, str]:
        """Pick the candidate whose title is closest to any search title.

        Candidates are first narrowed with :meth:`getMatchingOnTitleWords`,
        then each is compared to every search title using ``fuzz.ratio``.
        The first pair with the strictly highest ratio wins.

        Args:
            idToTitle: Candidate titles keyed by id.
            titles: Search titles to compare the candidates against.

        Returns:
            ``(best_id, best_title)`` as two strings, or ``("", "")`` when no
            comparison scored above 0.
        """
        # Filtered (or, as fallback, complete) candidates based on word matching.
        candidates = self.getMatchingOnTitleWords(idToTitle, titles)

        best_match_id = ""
        best_match_title = ""
        best_ratio = 0

        for title in titles:
            for entry_id, (stored_title, _word_score) in candidates.items():
                ratio = fuzz.ratio(title, stored_title)
                if ratio > best_ratio:
                    best_ratio = ratio
                    best_match_id = entry_id
                    # Return the title string itself, as the annotation
                    # promises, not the (title, score) pair.
                    best_match_title = stored_title

        return best_match_id, best_match_title

    def logNoMatch(self, source: str, titles: List[str]) -> None:
        """Log at INFO level that ``source`` produced no match for ``titles``.

        Args:
            source: Label of the source that was queried.
            titles: Titles that were searched; joined with ``", "`` in the log.
        """
        combined_titles = ", ".join(titles)
        log.info(f"No match in source {source} for titles: {combined_titles}")