From f8c5c03438881bce497a9658b141c9a2ac5cd3ee Mon Sep 17 00:00:00 2001
From: bskjon
Date: Tue, 18 Jun 2024 19:21:25 +0200
Subject: [PATCH] Added missing files

---
 apps/pyMetadata/DryRun.py                   |  82 +++++++++++++
 apps/pyMetadata/algo/AdvancedMatcher.py     |  31 +++++
 apps/pyMetadata/algo/AlgorithmBase.py       |  33 +++++
 apps/pyMetadata/algo/PrefixMatcher.py       |  54 ++++++++
 apps/pyMetadata/algo/SimpleMatcher.py       |  33 +++++
 apps/pyMetadata/algo/SourceWeighted.py      | 105 +++++++++++++++
 apps/pyMetadata/algo/__init__.py            |   0
 apps/pyMetadata/app.py                      |   6 +-
 apps/pyMetadata/clazz/KafkaMessageSchema.py |  38 ++++++
 apps/pyMetadata/clazz/Metadata.py           |  26 ++++
 apps/pyMetadata/clazz/__init__.py           |   0
 apps/pyMetadata/clazz/shared.py             |  74 ++++++++++
 apps/pyMetadata/sources/source.py           |  60 ++++++++
 13 files changed, 539 insertions(+), 3 deletions(-)
 create mode 100644 apps/pyMetadata/DryRun.py
 create mode 100644 apps/pyMetadata/algo/AdvancedMatcher.py
 create mode 100644 apps/pyMetadata/algo/AlgorithmBase.py
 create mode 100644 apps/pyMetadata/algo/PrefixMatcher.py
 create mode 100644 apps/pyMetadata/algo/SimpleMatcher.py
 create mode 100644 apps/pyMetadata/algo/SourceWeighted.py
 create mode 100644 apps/pyMetadata/algo/__init__.py
 create mode 100644 apps/pyMetadata/clazz/KafkaMessageSchema.py
 create mode 100644 apps/pyMetadata/clazz/Metadata.py
 create mode 100644 apps/pyMetadata/clazz/__init__.py
 create mode 100644 apps/pyMetadata/clazz/shared.py
 create mode 100644 apps/pyMetadata/sources/source.py

diff --git a/apps/pyMetadata/DryRun.py b/apps/pyMetadata/DryRun.py
new file mode 100644
index 00000000..5b36ec69
--- /dev/null
+++ b/apps/pyMetadata/DryRun.py
@@ -0,0 +1,82 @@
+import logging
+import signal
+import sys
+import os
+from typing import List, Optional
+import uuid
+import threading
+import json
+import time
+from fuzzywuzzy import fuzz
+
+from algo.AdvancedMatcher import AdvancedMatcher
+from algo.SimpleMatcher import SimpleMatcher
+from algo.PrefixMatcher import PrefixMatcher
+from clazz.KafkaMessageSchema import KafkaMessage, MessageDataWrapper
+from clazz.Metadata import Metadata
+
+from sources.anii import Anii
+from sources.imdb import Imdb
+from sources.mal import Mal
+
+
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s [%(levelname)s] %(message)s",
+    handlers=[
+        logging.StreamHandler(sys.stdout)
+    ]
+)
+logger = logging.getLogger(__name__)
+
+
+class DryRun():
+    titles: List[str] = []
+
+    def __init__(self, titles: List[str]) -> None:
+        self.titles = titles
+
+    def run(self) -> None:
+        combined_titles = ", ".join(self.titles)
+        logger.info("Searching for %s", combined_titles)
+        result: Metadata | None = self.__getMetadata(self.titles)
+
+        message: str | None = None
+        if (result is None):
+            message = f"No result for {combined_titles}"
+            logger.info(message)
+
+        messageData = MessageDataWrapper(
+            status = "ERROR" if result is None else "COMPLETED",
+            message = message,
+            data = result,
+            derivedFromEventId = None
+        )
+
+        producerMessage = KafkaMessage(referenceId="DryRun..", data=messageData).to_json()
+        logger.info(producerMessage)
+
+    def __getMetadata(self, titles: List[str]) -> Metadata | None:
+        mal = Mal(titles=titles)
+        anii = Anii(titles=titles)
+        imdb = Imdb(titles=titles)
+
+        results: List[Metadata] = [
+            mal.search(),
+            anii.search(),
+            imdb.search()
+        ]
+        filtered_results = [result for result in results if result is not None]
+        logger.info("Simple matcher")
+        simpleSelector = SimpleMatcher(titles=titles, metadata=filtered_results).getBestMatch()
+        logger.info("Advanced matcher")
+        advancedSelector = AdvancedMatcher(titles=titles, metadata=filtered_results).getBestMatch()
+        logger.info("Prefix matcher")
+        prefixSelector = PrefixMatcher(titles=titles, metadata=filtered_results).getBestMatch()
+        if prefixSelector is not None:
+            return prefixSelector
+        if simpleSelector is not None:
+            return simpleSelector
+        if advancedSelector is not None:
+            return advancedSelector
+        return None
\ No newline at end of file
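
For context, the cascade in DryRun.__getMetadata can be exercised without Kafka or the remote sources by handing pre-built Metadata objects straight to the matchers. A rough sketch, not part of the patch; it assumes it is run from apps/pyMetadata with fuzzywuzzy and tabulate installed, and the pick helper plus the sample titles are made up:

from typing import List, Optional

from algo.AdvancedMatcher import AdvancedMatcher
from algo.PrefixMatcher import PrefixMatcher
from algo.SimpleMatcher import SimpleMatcher
from clazz.Metadata import Metadata

def pick(titles: List[str], candidates: List[Metadata]) -> Optional[Metadata]:
    # Same precedence as DryRun.__getMetadata: prefix first, then simple, then advanced.
    for matcher_cls in (PrefixMatcher, SimpleMatcher, AdvancedMatcher):
        match = matcher_cls(titles=titles, metadata=candidates).getBestMatch()
        if match is not None:
            return match
    return None

candidates = [
    Metadata(title="Frieren: Beyond Journey's End", altTitle=["Sousou no Frieren"],
             cover="", banner=None, type="Serie", summary=[], genres=[], source="mal"),
]
print(pick(["Sousou no Frieren"], candidates))
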
logger.info("Advanced matcher") + advancedSelector = AdvancedMatcher(titles=titles, metadata=filtered_results).getBestMatch() + logger.info("Prefrix matcher") + prefixSelector = PrefixMatcher(titles=titles, metadata=filtered_results).getBestMatch() + if prefixSelector is not None: + return prefixSelector + if simpleSelector is not None: + return simpleSelector + if advancedSelector is not None: + return advancedSelector + return None \ No newline at end of file diff --git a/apps/pyMetadata/algo/AdvancedMatcher.py b/apps/pyMetadata/algo/AdvancedMatcher.py new file mode 100644 index 00000000..e33c2b0f --- /dev/null +++ b/apps/pyMetadata/algo/AdvancedMatcher.py @@ -0,0 +1,31 @@ +from fuzzywuzzy import fuzz +from .AlgorithmBase import AlgorithmBase, MatchResult +from clazz.Metadata import Metadata + +class AdvancedMatcher(AlgorithmBase): + def getBestMatch(self) -> Metadata | None: + best_match = None + best_score = -1 + match_results = [] + + for title in self.titles: + for metadata in self.metadata: + # Compute different match ratios + title_ratio = fuzz.token_sort_ratio(title.lower(), metadata.title.lower()) + alt_title_ratios = [fuzz.token_sort_ratio(title.lower(), alt_title.lower()) for alt_title in metadata.altTitle] + max_alt_title_ratio = max(alt_title_ratios) if alt_title_ratios else 0 + + # Combine ratios as desired + combined_score = max(title_ratio, max_alt_title_ratio) + + match_results.append(MatchResult(title, metadata.title, combined_score, metadata.source, metadata)) + + # Update best match if this one is better + if combined_score > best_score: + best_score = combined_score + best_match = metadata if combined_score >= 70 else None + + # Print match summary + self.print_match_summary(match_results) + + return best_match diff --git a/apps/pyMetadata/algo/AlgorithmBase.py b/apps/pyMetadata/algo/AlgorithmBase.py new file mode 100644 index 00000000..a8683fc9 --- /dev/null +++ b/apps/pyMetadata/algo/AlgorithmBase.py @@ -0,0 +1,33 @@ + + +from abc import ABC, abstractmethod +from dataclasses import dataclass +from typing import List + +from fuzzywuzzy import fuzz, process +from tabulate import tabulate + +from clazz.Metadata import Metadata + +@dataclass +class MatchResult: + title: str + matched_title: str + score: int + source: str + data: Metadata + + +class AlgorithmBase(ABC): + def __init__(self, titles: List[str], metadata: List[Metadata]): + self.titles = titles + self.metadata = metadata + + @abstractmethod + def getBestMatch(self) -> Metadata | None: + pass + + def print_match_summary(self, match_results: List[MatchResult]): + headers = ["Title", "Matched Title", "Score", "Source"] + data = [(result.title, result.matched_title, result.score, result.source) for result in match_results] + print(tabulate(data, headers=headers)) \ No newline at end of file diff --git a/apps/pyMetadata/algo/PrefixMatcher.py b/apps/pyMetadata/algo/PrefixMatcher.py new file mode 100644 index 00000000..5d6887f8 --- /dev/null +++ b/apps/pyMetadata/algo/PrefixMatcher.py @@ -0,0 +1,54 @@ +import re +from typing import List, Optional +from fuzzywuzzy import fuzz, process +from .AlgorithmBase import AlgorithmBase, MatchResult +from clazz.Metadata import Metadata + + +class PrefixMatcher(AlgorithmBase): + + def preprocess_text(self, text: str) -> str: + unitext = re.sub(r'[^a-zA-Z0-9\s]', ' ', text) + return unitext.strip().lower() + + def source_priority(self, source: str) -> int: + priority_map = {'mal': 1, 'anii': 2, 'imdb': 3} + return priority_map.get(source, 4) + + def getBestMatch(self) -> 
diff --git a/apps/pyMetadata/algo/SimpleMatcher.py b/apps/pyMetadata/algo/SimpleMatcher.py
new file mode 100644
index 00000000..f9da2db2
--- /dev/null
+++ b/apps/pyMetadata/algo/SimpleMatcher.py
@@ -0,0 +1,33 @@
+
+from fuzzywuzzy import fuzz, process
+from .AlgorithmBase import AlgorithmBase, MatchResult
+from clazz.Metadata import Metadata
+
+
+class SimpleMatcher(AlgorithmBase):
+    def getBestMatch(self) -> Metadata | None:
+        best_match = None
+        best_score = -1
+        match_results = []
+
+        for title in self.titles:
+            for metadata in self.metadata:
+                # Match against metadata title
+                score = fuzz.token_sort_ratio(title.lower(), metadata.title.lower())
+                match_results.append(MatchResult(title, metadata.title, score, metadata.source, metadata))
+                if score > best_score:
+                    best_score = score
+                    best_match = metadata if score >= 70 else None
+
+                # Match against metadata altTitles
+                for alt_title in metadata.altTitle:
+                    alt_score = fuzz.token_sort_ratio(title.lower(), alt_title.lower())
+                    match_results.append(MatchResult(title, alt_title, alt_score, metadata.source, metadata))
+                    if alt_score > best_score:
+                        best_score = alt_score
+                        best_match = metadata if alt_score >= 70 else None
+
+        # Print match summary
+        self.print_match_summary(match_results)
+
+        return best_match
\ No newline at end of file
diff --git a/apps/pyMetadata/algo/SourceWeighted.py b/apps/pyMetadata/algo/SourceWeighted.py
new file mode 100644
index 00000000..0bb730d6
--- /dev/null
+++ b/apps/pyMetadata/algo/SourceWeighted.py
@@ -0,0 +1,105 @@
+from dataclasses import dataclass
+from typing import List, Optional
+from fuzzywuzzy import fuzz
+from unidecode import unidecode
+import pandas as pd
+import logging
+import re
+
+from clazz.Metadata import Metadata
+
+log = logging.getLogger(__name__)
+
+@dataclass
+class WeightedData:
+    result: Metadata
+    weight: float
+
+@dataclass
+class DataAndScore:
+    result: Metadata
+    score: float
+    weight: float
+    matched_title: str
+
+
+class UseSource:
+    titles: List[str]
+    dataWeighed: List[WeightedData]
+
+    def __init__(self, titles: List[str], mal: Optional[Metadata] = None, imdb: Optional[Metadata] = None, anii: Optional[Metadata] = None) -> None:
+        self.titles = titles
+        self.dataWeighed = []
+        if mal is not None:
+            self.dataWeighed.append(WeightedData(mal, 1.5))
+
+        if imdb is not None:
+            self.dataWeighed.append(WeightedData(imdb, 1))
+
+        if anii is not None:
+            self.dataWeighed.append(WeightedData(anii, 1.3))
+
+
+    def stripped(self, input_string) -> str:
+        unitext = unidecode(input_string)
+        unitext = re.sub(r'[^a-zA-Z0-9\s]', ' ', unitext)
+        unitext = re.sub(r'\s{2,}', ' ', unitext)
+        return unitext.strip()
+
+
+    def __calculate_score(self, title: str, weightData: List[WeightedData]) -> List[DataAndScore]:
+        result: List[DataAndScore] = []
+
+        for title_to_check in self.titles:
+            for wd in weightData:
+                if wd.result is None:
+                    continue
+
+                highScore = fuzz.ratio(self.stripped(title_to_check.lower()), self.stripped(wd.result.title.lower()))
+                for alt_title in wd.result.altTitle:
+                    altScore = fuzz.ratio(self.stripped(title_to_check.lower()), self.stripped(alt_title.lower()))
+                    if altScore > highScore:
+                        highScore = altScore
+                givenScore = highScore * wd.weight
+                result.append(DataAndScore(wd.result, givenScore, wd.weight, title_to_check))
+
+        result.sort(key=lambda x: x.score, reverse=True)
+        return result
+
+    def select_result_table(self) -> Optional[pd.DataFrame]:
+        scoredResults = []
+        for title in self.titles:
+            scoredResult = self.__calculate_score(title=title, weightData=self.dataWeighed)
+            scoredResults.append(scoredResult)
+
+        all_results = [item for sublist in scoredResults for item in sublist]
+
+        if not all_results:
+            return None
+
+        # Prepare data for DataFrame
+        data = {
+            "Title": [],
+            "Alt Title": [],
+            "Score": [],
+            "Weight": [],
+            "Matched Title": []
+        }
+
+        for ds in all_results:
+            metadata = ds.result
+            data["Title"].append(metadata.title)
+            data["Alt Title"].append(", ".join(metadata.altTitle))
+            data["Score"].append(ds.score)
+            data["Weight"].append(ds.weight)
+            data["Matched Title"].append(ds.matched_title)
+
+        df = pd.DataFrame(data)
+        df = df.sort_values(by="Score", ascending=False).reset_index(drop=True)
+
+        try:
+            df.to_json(f"./logs/{self.titles[0]}.json", orient="records", indent=4)
+        except Exception as e:
+            log.error(f"Failed to dump JSON: {e}")
+
+        return df
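
SourceWeighted.UseSource is not wired into DryRun or app.py in this patch, but the intent of the weights is that a weaker raw ratio from a higher-weighted source can outrank an exact hit from a lower-weighted one. A standalone back-of-the-envelope illustration; the titles are made up and only fuzzywuzzy is needed:

from fuzzywuzzy import fuzz

# Same per-source weights UseSource assigns: MAL 1.5, Anii 1.3, IMDb 1.0.
weights = {"mal": 1.5, "anii": 1.3, "imdb": 1.0}
query = "sousou no frieren"
candidates = [
    ("imdb", "sousou no frieren"),       # exact text, lowest weight
    ("mal", "sousou no frieren (tv)"),   # near miss, highest weight
]
for source, title in candidates:
    raw = fuzz.ratio(query, title)
    print(f"{source}: raw={raw} weighted={raw * weights[source]:.1f}")
# imdb scores 100 raw -> 100 weighted; mal scores roughly 87 raw -> about 130 weighted,
# so the MAL entry ends up on top despite the lower raw ratio.
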
diff --git a/apps/pyMetadata/algo/__init__.py b/apps/pyMetadata/algo/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/apps/pyMetadata/app.py b/apps/pyMetadata/app.py
index 7cbc927d..f550681a 100644
--- a/apps/pyMetadata/app.py
+++ b/apps/pyMetadata/app.py
@@ -9,9 +9,9 @@
 import json
 import time
 from fuzzywuzzy import fuzz
-from algo.AdvancedMatcher import AdvancedMatcher
-from algo.SimpleMatcher import SimpleMatcher
-from algo.PrefixMatcher import PrefixMatcher
+from .algo.AdvancedMatcher import AdvancedMatcher
+from .algo.SimpleMatcher import SimpleMatcher
+from .algo.PrefixMatcher import PrefixMatcher
 from clazz.shared import ConsumerRecord, MediaEvent, decode_key, decode_value, suppress_ignore, consume_on_key
 from clazz.KafkaMessageSchema import KafkaMessage, MessageDataWrapper
 from clazz.Metadata import Metadata
diff --git a/apps/pyMetadata/clazz/KafkaMessageSchema.py b/apps/pyMetadata/clazz/KafkaMessageSchema.py
new file mode 100644
index 00000000..b8901ec1
--- /dev/null
+++ b/apps/pyMetadata/clazz/KafkaMessageSchema.py
@@ -0,0 +1,38 @@
+
+
+from dataclasses import asdict, dataclass
+import uuid, json
+
+from .Metadata import Metadata
+
+
+@dataclass
+class MessageDataWrapper:
+    status: str # COMPLETED / ERROR
+    message: str | None
+    data: Metadata | None
+    derivedFromEventId: str | None
+
+    def to_dict(self):
+        return asdict(self)
+
+
+class KafkaMessage:
+    referenceId: str
+    eventId: str
+    data: MessageDataWrapper
+
+    def __init__(self, referenceId: str, data: MessageDataWrapper) -> None:
+        self.referenceId = referenceId
+        self.data = data
+        self.eventId = str(uuid.uuid4())
+
+    def to_json(self):
+        payload = {
+            'referenceId': self.referenceId,
+            'eventId': self.eventId,
+            'data': self.data.to_dict() if self.data else None
+        }
+        return json.dumps(payload)
+
+ 
\ No newline at end of file
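
For reference, this is roughly the envelope KafkaMessage.to_json() emits for a successful lookup. Every field value below is a placeholder; only the field names come from the schema above:

from clazz.KafkaMessageSchema import KafkaMessage, MessageDataWrapper
from clazz.Metadata import Metadata, Summary

# Placeholder metadata for illustration only.
meta = Metadata(
    title="Example Show",
    altTitle=["Example Shou"],
    cover="https://example.org/cover.jpg",
    banner=None,
    type="Serie",
    summary=[Summary(summary="An example synopsis.", language="en")],
    genres=["Drama"],
    source="mal",
)
wrapper = MessageDataWrapper(status="COMPLETED", message=None, data=meta, derivedFromEventId=None)
print(KafkaMessage(referenceId="some-reference-id", data=wrapper).to_json())
# {"referenceId": "some-reference-id", "eventId": "<fresh uuid4>", "data": {"status": "COMPLETED", ...}}
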
diff --git a/apps/pyMetadata/clazz/Metadata.py b/apps/pyMetadata/clazz/Metadata.py
new file mode 100644
index 00000000..112ec067
--- /dev/null
+++ b/apps/pyMetadata/clazz/Metadata.py
@@ -0,0 +1,26 @@
+from dataclasses import asdict, dataclass
+from typing import List, Optional
+
+@dataclass
+class Summary:
+    summary: str
+    language: str
+
+    def to_dict(self):
+        return asdict(self)
+
+
+@dataclass
+class Metadata:
+    title: str
+    altTitle: List[str]
+    cover: str
+    banner: Optional[str]
+    type: str # Serie/Movie
+    summary: List[Summary]
+    genres: List[str]
+    source: str
+
+    def to_dict(self):
+        return asdict(self)
+
diff --git a/apps/pyMetadata/clazz/__init__.py b/apps/pyMetadata/clazz/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/apps/pyMetadata/clazz/shared.py b/apps/pyMetadata/clazz/shared.py
new file mode 100644
index 00000000..ced213ba
--- /dev/null
+++ b/apps/pyMetadata/clazz/shared.py
@@ -0,0 +1,74 @@
+
+from typing import Any, List
+import json
+
+
+suppress_ignore: List[str] = [
+    "event:media-process:started",
+    "event:request-process:started",
+    "event::save",
+    "event:media-process:completed",
+    "event:work-encode:created",
+    "event:work-extract:created",
+    "event:work-convert:created",
+    "event:work-encode:performed",
+    "event:work-extract:performed",
+    "event:work-convert:performed",
+    "event:media-read-out-cover:performed",
+    "event:work-download-cover:performed",
+    "event:media-read-out-name-and-type:performed",
+    "event:media-parse-stream:performed",
+    "event:media-extract-parameter:created",
+    "event:media-encode-parameter:created",
+    "event:media-metadata-search:performed"
+]
+
+consume_on_key: List[str] = [
+    "request:metadata:obtain",
+    "event:media-read-base-info:performed"
+]
+
+def decode_key(key_bytes: bytes | None):
+    return key_bytes.decode('utf-8') if key_bytes else None
+
+def decode_value(value_bytes: bytes | None):
+    return json.loads(value_bytes.decode('utf-8')) if value_bytes else None
+
+
+
+class ConsumerRecord:
+    topic: str
+    partition: int
+    offset: int
+    key: str
+    value: str | None
+    timestamp: int
+
+    def __init__(self, message: Any) -> None:
+        if message is not None:
+            self.key = message.key
+            self.value = message.value
+            self.topic = message.topic
+            self.offset = message.offset
+            self.partition = message.partition
+            self.timestamp = message.timestamp
+
+
+class MediaEvent():
+    __consumerRecord: ConsumerRecord
+    referenceId: str
+    eventId: str
+    data: dict | None
+
+    def __init__(self, message: ConsumerRecord) -> None:
+        self.__consumerRecord = message
+        self.referenceId = message.value["referenceId"]
+        self.eventId = message.value["eventId"]
+        self.data = message.value["data"] if "data" in message.value else None
+
+    def isConsumable(self) -> bool:
+        if self.data is not None and "status" in self.data:
+            if self.data["status"] == "COMPLETED":
+                return True
+        return False
+
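
app.py's consumer loop is not part of this patch, but the shared helpers are meant to be chained roughly as follows. The record below is a stub (SimpleNamespace) standing in for whatever the Kafka client returns, and the topic, offset, and ids are made up:

import json
from types import SimpleNamespace

from clazz.shared import ConsumerRecord, MediaEvent, decode_key, decode_value, consume_on_key

# Stub exposing only the attributes ConsumerRecord reads.
raw = SimpleNamespace(
    topic="mediaEvents",
    partition=0,
    offset=42,
    key=decode_key(b"event:media-read-base-info:performed"),
    value=decode_value(json.dumps({
        "referenceId": "ref-1",
        "eventId": "evt-1",
        "data": {"status": "COMPLETED"},
    }).encode("utf-8")),
    timestamp=1718731285000,
)

record = ConsumerRecord(raw)
if record.key in consume_on_key:
    event = MediaEvent(record)
    print(event.referenceId, event.isConsumable())   # ref-1 True
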
diff --git a/apps/pyMetadata/sources/source.py b/apps/pyMetadata/sources/source.py
new file mode 100644
index 00000000..44349a0e
--- /dev/null
+++ b/apps/pyMetadata/sources/source.py
@@ -0,0 +1,60 @@
+
+import logging, re
+from abc import ABC, abstractmethod
+from typing import List, Tuple
+
+from fuzzywuzzy import fuzz
+
+from clazz.Metadata import Metadata
+
+log = logging.getLogger(__name__)
+
+class SourceBase(ABC):
+    titles: List[str] = []
+
+
+    def __init__(self, titles: List[str]) -> None:
+        self.titles = titles
+
+    @abstractmethod
+    def search(self) -> Metadata | None:
+        pass
+
+    @abstractmethod
+    def queryIds(self, title: str) -> dict[str, str]:
+        pass
+
+    def isMatchOrPartial(self, source: str | None, title, foundTitle) -> bool:
+        titleParts = re.split(r'[^a-zA-Z0-9\s]', foundTitle)
+        clean_foundTitle: str | None = titleParts[0].strip() if titleParts else None
+        directMatch = fuzz.ratio(title, foundTitle)
+        partialMatch = fuzz.ratio(title, clean_foundTitle) if clean_foundTitle is not None else 0
+
+        if directMatch >= 60:
+            return True
+        elif partialMatch >= 80:
+            log.info(f"{source} -> Partial Match for '{title}' of '{foundTitle}' on part '{clean_foundTitle}' with direct score: {directMatch} and partial {partialMatch}")
+            return True
+        else:
+            log.info(f"{source} -> Match failed for '{title}' of '{foundTitle}' on part '{clean_foundTitle}' with direct score: {directMatch} and partial {partialMatch}")
+            return False
+
+
+    def findBestMatchAcrossTitles(self, idToTitle: dict[str, str], titles: List[str]) -> Tuple[str, str]:
+        best_match_id = ""
+        best_match_title = ""
+        best_ratio = 0
+
+        for title in titles:
+            for id, stored_title in idToTitle.items():
+                ratio = fuzz.ratio(title, stored_title)
+                if ratio > best_ratio:
+                    best_ratio = ratio
+                    best_match_id = id
+                    best_match_title = stored_title
+
+        return best_match_id, best_match_title
+
+    def logNoMatch(self, source: str, titles: List[str]) -> None:
+        combined_titles = ", ".join(titles)
+        log.info(f"No match in source {source} for titles: {combined_titles}")
\ No newline at end of file
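
The concrete sources (Mal, Anii, Imdb) imported by DryRun.py are not included in this patch. A schematic, offline subclass to show how SourceBase's abstract contract and helpers fit together; StaticSource and its catalog are invented for illustration:

from typing import Optional

from sources.source import SourceBase
from clazz.Metadata import Metadata

class StaticSource(SourceBase):
    """Schematic, offline stand-in for a real source such as Mal/Anii/Imdb."""

    CATALOG = {"42": "Example Show: Season 1"}   # made-up id -> title mapping

    def queryIds(self, title: str) -> dict[str, str]:
        # A real source would query an external API here; this just returns the static catalog.
        return self.CATALOG

    def search(self) -> Optional[Metadata]:
        ids = self.queryIds(self.titles[0]) if self.titles else {}
        best_id, best_title = self.findBestMatchAcrossTitles(ids, self.titles)
        if not best_id or not self.isMatchOrPartial("static", self.titles[0], best_title):
            self.logNoMatch("static", self.titles)
            return None
        return Metadata(title=best_title, altTitle=[], cover="", banner=None,
                        type="Serie", summary=[], genres=[], source="static")

print(StaticSource(titles=["Example Show"]).search())
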