112 lines
3.7 KiB
Python
112 lines
3.7 KiB
Python
import logging
import re
from dataclasses import dataclass
from typing import List, Optional

import pandas as pd
from fuzzywuzzy import fuzz
from unidecode import unidecode

from clazz.Metadata import Metadata
|
|
|
|
# Module-level logger, named after this module via __name__.
log = logging.getLogger(__name__)
|
|
|
|
@dataclass
class WeightedData:
    """Pair a metadata record with the weight used when scoring it."""

    # Metadata record obtained from one source.
    result: Metadata
    # Multiplier applied to the raw match score of this record.
    weight: float
|
|
|
|
@dataclass
class DataAndScore:
    """Scoring outcome for one metadata record against one title."""

    # Metadata record that was scored.
    result: Metadata
    # Weighted score assigned to the record.
    score: float
    # Weight that was applied to obtain the score.
    weight: float
    # The title the record was compared against.
    matched_title: str
|
|
|
|
|
|
class UseSource:
    """Rank metadata candidates from several sources against known titles.

    Every source passed to the constructor that is not ``None`` is stored
    together with a fixed weight. A candidate's score is the best
    ``fuzz.ratio`` between a query title and the candidate's main or
    alternative titles, multiplied by the weight of its source.
    """

    # Per-source multipliers applied to the raw fuzzy ratio.
    MAL_WEIGHT = 1.5
    IMDB_WEIGHT = 1
    ANII_WEIGHT = 1.3

    # Declared without defaults on purpose: a class-level ``[]`` would be
    # shared by, and appended to from, every instance.
    titles: List[str]
    dataWeighed: List[WeightedData]

    def __init__(self, titles: List[str], mal: Optional[Metadata] = None, imdb: Optional[Metadata] = None, anii: Optional[Metadata] = None) -> None:
        """Store the query titles and wrap each provided source with its weight.

        Args:
            titles: Titles that candidates are compared against.
            mal: Optional metadata record, weighted by ``MAL_WEIGHT``.
            imdb: Optional metadata record, weighted by ``IMDB_WEIGHT``.
            anii: Optional metadata record, weighted by ``ANII_WEIGHT``.
        """
        self.titles = titles
        weighted_sources = (
            (mal, self.MAL_WEIGHT),
            (imdb, self.IMDB_WEIGHT),
            (anii, self.ANII_WEIGHT),
        )
        # Build a fresh list per instance so sources never leak between
        # instances; order (mal, imdb, anii) is preserved.
        self.dataWeighed = [
            WeightedData(metadata, weight)
            for metadata, weight in weighted_sources
            if metadata is not None
        ]

    def stripped(self, input_string: str) -> str:
        """Return a normalised form of *input_string* for comparison.

        The text is transliterated with ``unidecode``, every character that
        is not an ASCII letter, digit or whitespace is replaced by a space,
        runs of two or more whitespace characters are collapsed into a
        single space, and surrounding whitespace is removed.
        """
        unitext = unidecode(input_string)
        unitext = re.sub(r'[^a-zA-Z0-9\s]', ' ', unitext)
        unitext = re.sub(r'\s{2,}', ' ', unitext)
        return unitext.strip()

    def __calculate_score(self, title: str, weightData: List[WeightedData]) -> List[DataAndScore]:
        """Score every weighted source against a single *title*.

        Args:
            title: The query title to compare against.
            weightData: Weighted metadata records to score.

        Returns:
            One ``DataAndScore`` per record whose ``result`` is not ``None``,
            sorted by score in descending order.
        """
        result: List[DataAndScore] = []
        # Normalise the query title once instead of once per comparison.
        needle = self.stripped(title.lower())

        for wd in weightData:
            if wd.result is None:
                continue

            highScore = fuzz.ratio(needle, self.stripped(wd.result.title.lower()))
            for alt_title in wd.result.altTitle:
                # Best effort: one malformed alternative title must not
                # abort scoring of the remaining ones.
                try:
                    altScore = fuzz.ratio(needle, self.stripped(alt_title.lower()))
                    if altScore > highScore:
                        highScore = altScore
                except Exception as e:
                    log.debug(f"Unntak: {e}")
                    log.debug(f"type(title): {type(title)}, value: {title}")
                    log.debug(f"type(alt_title): {type(alt_title)}, value: {alt_title}")
                    log.debug("Metadata objekt:")
                    log.debug(weightData)

            givenScore = highScore * wd.weight
            result.append(DataAndScore(wd.result, givenScore, wd.weight, title))

        result.sort(key=lambda x: x.score, reverse=True)
        return result

    def select_result_table(self) -> Optional[pd.DataFrame]:
        """Build a table of all scored candidates, best score first.

        Each title in ``self.titles`` is scored against each stored source
        exactly once. As a side effect the table is written to
        ``./logs/<first title>.json``; a failure to write is logged and
        does not affect the return value.

        Returns:
            A DataFrame with the columns ``Title``, ``Alt Title``,
            ``Score``, ``Weight`` and ``Matched Title`` sorted by ``Score``
            in descending order, or ``None`` when nothing was scored.
        """
        all_results = [
            scored
            for title in self.titles
            for scored in self.__calculate_score(title=title, weightData=self.dataWeighed)
        ]

        if not all_results:
            return None

        # Prepare data for DataFrame
        data = {
            "Title": [ds.result.title for ds in all_results],
            # Alternative titles are not guaranteed to be strings (the
            # scorer already guards against that), so coerce before joining.
            "Alt Title": [", ".join(str(alt) for alt in ds.result.altTitle) for ds in all_results],
            "Score": [ds.score for ds in all_results],
            "Weight": [ds.weight for ds in all_results],
            "Matched Title": [ds.matched_title for ds in all_results],
        }

        df = pd.DataFrame(data)
        df = df.sort_values(by="Score", ascending=False).reset_index(drop=True)

        # Dumping the table is diagnostic only; never let it break the caller.
        try:
            df.to_json(f"./logs/{self.titles[0]}.json", orient="records", indent=4)
        except Exception as e:
            log.error(f"Failed to dump JSON: {e}")

        return df
|