Added missing files

This commit is contained in:
bskjon 2024-06-18 19:21:25 +02:00
parent 4f0886b3fd
commit f8c5c03438
13 changed files with 537 additions and 3 deletions

apps/pyMetadata/DryRun.py
View File

@@ -0,0 +1,82 @@
import logging
import signal
import sys
import os
from typing import List, Optional
import uuid
import threading
import json
import time
from fuzzywuzzy import fuzz
from algo.AdvancedMatcher import AdvancedMatcher
from algo.SimpleMatcher import SimpleMatcher
from algo.PrefixMatcher import PrefixMatcher
from clazz.KafkaMessageSchema import KafkaMessage, MessageDataWrapper
from clazz.Metadata import Metadata
from sources.anii import Anii
from sources.imdb import Imdb
from sources.mal import Mal
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s [%(levelname)s] %(message)s",
handlers=[
logging.StreamHandler(sys.stdout)
]
)
logger = logging.getLogger(__name__)
class DryRun():
titles: List[str] = []
def __init__(self, titles: List[str]) -> None:
self.titles = titles
def run(self) -> None:
combined_titles = ", ".join(self.titles)
logger.info("Searching for %s", combined_titles)
result: Metadata | None = self.__getMetadata(self.titles)
message: str | None = None
if (result is None):
message = f"No result for {combined_titles}"
logger.info(message)
messageData = MessageDataWrapper(
status = "ERROR" if result is None else "COMPLETED",
message = message,
data = result,
derivedFromEventId = None
)
producerMessage = KafkaMessage(referenceId="DryRun..", data=messageData).to_json()
logger.info(producerMessage)
def __getMetadata(self, titles: List[str]) -> Metadata | None:
mal = Mal(titles=titles)
anii = Anii(titles=titles)
imdb = Imdb(titles=titles)
results: List[Metadata] = [
mal.search(),
anii.search(),
imdb.search()
]
filtered_results = [result for result in results if result is not None]
logger.info("Simple matcher")
simpleSelector = SimpleMatcher(titles=titles, metadata=filtered_results).getBestMatch()
logger.info("Advanced matcher")
advancedSelector = AdvancedMatcher(titles=titles, metadata=filtered_results).getBestMatch()
logger.info("Prefrix matcher")
prefixSelector = PrefixMatcher(titles=titles, metadata=filtered_results).getBestMatch()
if prefixSelector is not None:
return prefixSelector
if simpleSelector is not None:
return simpleSelector
if advancedSelector is not None:
return advancedSelector
return None
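
For context, a minimal way to exercise this class could look like the sketch below; it assumes apps/pyMetadata is the working directory and that the Mal, Anii and Imdb sources can reach their backends.

# Hypothetical driver for the DryRun class above (not part of this commit).
from DryRun import DryRun

if __name__ == "__main__":
    # Candidate titles for one piece of media; the matchers pick the best metadata hit.
    DryRun(titles=["Sousou no Frieren", "Frieren: Beyond Journey's End"]).run()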

View File

@@ -0,0 +1,31 @@
from fuzzywuzzy import fuzz
from .AlgorithmBase import AlgorithmBase, MatchResult
from clazz.Metadata import Metadata
class AdvancedMatcher(AlgorithmBase):
def getBestMatch(self) -> Metadata | None:
best_match = None
best_score = -1
match_results = []
for title in self.titles:
for metadata in self.metadata:
# Compute different match ratios
title_ratio = fuzz.token_sort_ratio(title.lower(), metadata.title.lower())
alt_title_ratios = [fuzz.token_sort_ratio(title.lower(), alt_title.lower()) for alt_title in metadata.altTitle]
max_alt_title_ratio = max(alt_title_ratios) if alt_title_ratios else 0
# Combine ratios as desired
combined_score = max(title_ratio, max_alt_title_ratio)
match_results.append(MatchResult(title, metadata.title, combined_score, metadata.source, metadata))
# Update best match if this one is better
if combined_score > best_score:
best_score = combined_score
best_match = metadata if combined_score >= 70 else None
# Print match summary
self.print_match_summary(match_results)
return best_match

View File

@@ -0,0 +1,33 @@
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import List
from fuzzywuzzy import fuzz, process
from tabulate import tabulate
from clazz.Metadata import Metadata
@dataclass
class MatchResult:
title: str
matched_title: str
score: int
source: str
data: Metadata
class AlgorithmBase(ABC):
def __init__(self, titles: List[str], metadata: List[Metadata]):
self.titles = titles
self.metadata = metadata
@abstractmethod
def getBestMatch(self) -> Metadata | None:
pass
def print_match_summary(self, match_results: List[MatchResult]):
headers = ["Title", "Matched Title", "Score", "Source"]
data = [(result.title, result.matched_title, result.score, result.source) for result in match_results]
print(tabulate(data, headers=headers))
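
As a rough illustration of the contract above, a custom matcher only has to implement getBestMatch and can reuse print_match_summary from the base class. The ExactMatcher below is hypothetical and not part of this commit; the import paths are assumed.

from typing import List
from clazz.Metadata import Metadata
from algo.AlgorithmBase import AlgorithmBase, MatchResult  # path assumed

class ExactMatcher(AlgorithmBase):
    def getBestMatch(self) -> Metadata | None:
        results: List[MatchResult] = []
        for title in self.titles:
            for metadata in self.metadata:
                # 100 for a case-insensitive exact hit, 0 otherwise
                score = 100 if title.lower() == metadata.title.lower() else 0
                results.append(MatchResult(title, metadata.title, score, metadata.source, metadata))
        self.print_match_summary(results)
        return next((r.data for r in results if r.score == 100), None)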

View File

@@ -0,0 +1,54 @@
import re
from typing import List, Optional
from fuzzywuzzy import fuzz, process
from .AlgorithmBase import AlgorithmBase, MatchResult
from clazz.Metadata import Metadata
class PrefixMatcher(AlgorithmBase):
def preprocess_text(self, text: str) -> str:
unitext = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)
return unitext.strip().lower()
def source_priority(self, source: str) -> int:
priority_map = {'mal': 1, 'anii': 2, 'imdb': 3}
return priority_map.get(source, 4)
def getBestMatch(self) -> Optional[Metadata]:
best_match = None
best_score = -1
match_results: List[MatchResult] = []
for title in self.titles:
preprocessed_title = self.preprocess_text(title)[:1]
for metadata in self.metadata:
preprocessed_metadata_title = self.preprocess_text(metadata.title)[:1]
# Match against metadata title
score = fuzz.token_sort_ratio(preprocessed_title, preprocessed_metadata_title)
match_results.append(MatchResult(title, metadata.title, score, metadata.source, metadata))
if score > best_score:
best_score = score
best_match = metadata if score >= 70 else None
# Match against metadata altTitles
for alt_title in metadata.altTitle:
preprocessed_alt_title = self.preprocess_text(alt_title)[:1]
alt_score = fuzz.token_sort_ratio(preprocessed_title, preprocessed_alt_title)
match_results.append(MatchResult(title, alt_title, alt_score, metadata.source, metadata))
if alt_score > best_score:
best_score = alt_score
best_match = metadata if alt_score >= 70 else None
match_results.sort(key=lambda x: (-x.score, self.source_priority(x.source)))
# Print match summary
self.print_match_summary(match_results)
if match_results:
top_result = match_results[0].data
return top_result
return best_match

View File

@@ -0,0 +1,33 @@
from fuzzywuzzy import fuzz, process
from .AlgorithmBase import AlgorithmBase, MatchResult
from clazz.Metadata import Metadata
class SimpleMatcher(AlgorithmBase):
def getBestMatch(self) -> Metadata | None:
best_match = None
best_score = -1
match_results = []
for title in self.titles:
for metadata in self.metadata:
# Match against metadata title
score = fuzz.token_sort_ratio(title.lower(), metadata.title.lower())
match_results.append(MatchResult(title, metadata.title, score, metadata.source, metadata))
if score > best_score:
best_score = score
best_match = metadata if score >= 70 else None
# Match against metadata altTitles
for alt_title in metadata.altTitle:
alt_score = fuzz.token_sort_ratio(title.lower(), alt_title.lower())
match_results.append(MatchResult(title, alt_title, alt_score, metadata.source, metadata))
if alt_score > best_score:
best_score = alt_score
best_match = metadata if alt_score >= 70 else None
# Print match summary
self.print_match_summary(match_results)
return best_match
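
A small usage sketch for the matcher above with a hand-built Metadata record; the field values are invented for illustration and the import paths are assumed.

from algo.SimpleMatcher import SimpleMatcher  # path assumed
from clazz.Metadata import Metadata

candidates = [
    Metadata(title="Cowboy Bebop", altTitle=["Kaubôi Bibappu"], cover="", banner=None,
             type="Serie", summary=[], genres=["Sci-Fi"], source="mal"),
]
best = SimpleMatcher(titles=["cowboy bebop"], metadata=candidates).getBestMatch()
print(best.title if best else "no match cleared the 70-point threshold")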

View File

@@ -0,0 +1,103 @@
from dataclasses import dataclass
from typing import List, Optional
from fuzzywuzzy import fuzz
from unidecode import unidecode
import logging
import pandas as pd
import re
from clazz.Metadata import Metadata
log = logging.getLogger(__name__)
@dataclass
class WeightedData:
result: Metadata
weight: float
@dataclass
class DataAndScore:
result: Metadata
score: float
weight: float
matched_title: str
class UseSource:
def __init__(self, titles: List[str], mal: Optional[Metadata] = None, imdb: Optional[Metadata] = None, anii: Optional[Metadata] = None) -> None:
self.titles: List[str] = titles
# Per-instance list: a class-level mutable default would be shared across instances
self.dataWeighed: List[WeightedData] = []
if mal is not None:
self.dataWeighed.append(WeightedData(mal, 1.5))
if imdb is not None:
self.dataWeighed.append(WeightedData(imdb, 1))
if anii is not None:
self.dataWeighed.append(WeightedData(anii, 1.3))
def stripped(self, input_string) -> str:
unitext = unidecode(input_string)
unitext = re.sub(r'[^a-zA-Z0-9\s]', ' ', unitext)
unitext = re.sub(r'\s{2,}', ' ', unitext)
return unitext.strip()
def __calculate_score(self, title: str, weightData: List[WeightedData]) -> List[DataAndScore]:
result: List[DataAndScore] = []
for wd in weightData:
if wd.result is None:
continue
# Score the given title against the source's main title and all alternative titles
highScore = fuzz.ratio(self.stripped(title.lower()), self.stripped(wd.result.title.lower()))
for alt_title in wd.result.altTitle:
altScore = fuzz.ratio(self.stripped(title.lower()), self.stripped(alt_title.lower()))
if altScore > highScore:
highScore = altScore
givenScore = highScore * wd.weight
result.append(DataAndScore(wd.result, givenScore, wd.weight, title))
result.sort(key=lambda x: x.score, reverse=True)
return result
def select_result_table(self) -> Optional[pd.DataFrame]:
scoredResults = []
for title in self.titles:
scoredResult = self.__calculate_score(title=title, weightData=self.dataWeighed)
scoredResults.append(scoredResult)
all_results = [item for sublist in scoredResults for item in sublist]
if not all_results:
return None
# Prepare data for DataFrame
data = {
"Title": [],
"Alt Title": [],
"Score": [],
"Weight": [],
"Matched Title": []
}
for ds in all_results:
metadata = ds.result
data["Title"].append(metadata.title)
data["Alt Title"].append(", ".join(metadata.altTitle))
data["Score"].append(ds.score)
data["Weight"].append(ds.weight)
data["Matched Title"].append(ds.matched_title)
df = pd.DataFrame(data)
df = df.sort_values(by="Score", ascending=False).reset_index(drop=True)
try:
df.to_json(f"./logs/{self.titles[0]}.json", orient="records", indent=4)
except Exception as e:
log.error(f"Failed to dump JSON: {e}")
return df
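
To make the weighting concrete: the score kept for each source is the best fuzz.ratio across its main and alternative titles, multiplied by the per-source weight (1.5 for mal, 1.3 for anii, 1 for imdb). A rough sketch, with made-up strings:

from fuzzywuzzy import fuzz

# Best raw ratio across the main title and one alternative title
raw = max(fuzz.ratio("sousou no frieren", "sousou no frieren"),
          fuzz.ratio("sousou no frieren", "frieren beyond journey s end"))
weighted = raw * 1.5  # 1.5 is the mal weight used above
print(raw, weighted)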

View File

View File

@@ -9,9 +9,9 @@ import json
 import time
 from fuzzywuzzy import fuzz
-from algo.AdvancedMatcher import AdvancedMatcher
-from algo.SimpleMatcher import SimpleMatcher
-from algo.PrefixMatcher import PrefixMatcher
+from .algo.AdvancedMatcher import AdvancedMatcher
+from .algo.SimpleMatcher import SimpleMatcher
+from .algo.PrefixMatcher import PrefixMatcher
 from clazz.shared import ConsumerRecord, MediaEvent, decode_key, decode_value, suppress_ignore, consume_on_key
 from clazz.KafkaMessageSchema import KafkaMessage, MessageDataWrapper
 from clazz.Metadata import Metadata

View File

@@ -0,0 +1,38 @@
from dataclasses import asdict, dataclass
import uuid, json
from .Metadata import Metadata
@dataclass
class MessageDataWrapper:
status: str # COMPLETED / ERROR
message: str | None
data: Metadata | None
derivedFromEventId: str | None
def to_dict(self):
return asdict(self)
class KafkaMessage:
referenceId: str
eventId: str = str(uuid.uuid4())
data: MessageDataWrapper
def __init__(self, referenceId: str, data: MessageDataWrapper) -> None:
self.referenceId = referenceId
self.data = data
def to_json(self):
payload = {
'referenceId': self.referenceId,
'eventId': self.eventId,
'data': self.data.to_dict() if self.data else None
}
return json.dumps(payload)
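
A hedged example of serializing a result with the classes above; every field value is invented for illustration.

from clazz.KafkaMessageSchema import KafkaMessage, MessageDataWrapper
from clazz.Metadata import Metadata

meta = Metadata(title="Example", altTitle=[], cover="", banner=None,
                type="Movie", summary=[], genres=[], source="imdb")
wrapper = MessageDataWrapper(status="COMPLETED", message=None, data=meta, derivedFromEventId=None)
# to_json() nests the wrapper (and its Metadata) under "data" next to referenceId and eventId
print(KafkaMessage(referenceId="ref-123", data=wrapper).to_json())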

View File

@@ -0,0 +1,26 @@
from dataclasses import asdict, dataclass
from typing import List, Optional
@dataclass
class Summary:
summary: str
language: str
def to_dict(self):
return asdict(self)
@dataclass
class Metadata:
title: str
altTitle: List[str]
cover: str
banner: Optional[str]
type: str # Serie/Movie
summary: List[Summary]
genres: List[str]
source: str
def to_dict(self):
return asdict(self)

View File

View File

@@ -0,0 +1,74 @@
from typing import Any, List
import json
suppress_ignore: List[str] = [
"event:media-process:started",
"event:request-process:started",
"event::save",
"event:media-process:completed",
"event:work-encode:created",
"event:work-extract:created",
"event:work-convert:created",
"event:work-encode:performed",
"event:work-extract:performed",
"event:work-convert:performed",
"event:media-read-out-cover:performed",
"event:work-download-cover:performed",
"event:media-read-out-name-and-type:performed",
"event:media-parse-stream:performed",
"event:media-extract-parameter:created",
"event:media-encode-parameter:created",
"event:media-metadata-search:performed"
]
consume_on_key: List[str] = [
"request:metadata:obtain",
"event:media-read-base-info:performed"
]
def decode_key(key_bytes: bytes | None):
return key_bytes.decode('utf-8') if key_bytes else None
def decode_value(value_bytes: bytes | None):
return json.loads(value_bytes.decode('utf-8')) if value_bytes else None
class ConsumerRecord:
topic: str
partition: int
offset: int
key: str
value: str | None
timestamp: int
def __init__(self, message: Any) -> None:
if message is not None:
self.key = message.key
self.value = message.value
self.topic = message.topic
self.offset = message.offset
self.partition = message.partition
self.timestamp = message.timestamp
class MediaEvent():
__consumerRecord: ConsumerRecord
referenceId: str
eventId: str
data: dict | None
def __init__(self, message: ConsumerRecord) -> None:
self.__consumerRecord = message
self.referenceId = message.value["referenceId"]
self.eventId = message.value["eventId"]
self.data = message.value["data"] if "data" in message.value else None
def isConsumable(self) -> bool:
# Only events carrying a data payload with status COMPLETED are consumable
return self.data is not None and self.data.get("status") == "COMPLETED"
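
A rough sketch of how decode_key, decode_value, ConsumerRecord and MediaEvent fit together; FakeRecord stands in for whatever object the Kafka client library delivers and is purely hypothetical.

import json
from clazz.shared import ConsumerRecord, MediaEvent, decode_key, decode_value

class FakeRecord:
    topic, partition, offset, timestamp = "media-events", 0, 42, 1718731285000
    key = decode_key(b"event:media-read-base-info:performed")
    value = decode_value(json.dumps({
        "referenceId": "ref-1",
        "eventId": "evt-1",
        "data": {"status": "COMPLETED"}
    }).encode("utf-8"))

event = MediaEvent(ConsumerRecord(FakeRecord()))
print(event.referenceId, event.isConsumable())  # ref-1 True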

View File

@@ -0,0 +1,60 @@
import logging, re
from abc import ABC, abstractmethod
from typing import List, Tuple
from fuzzywuzzy import fuzz
from clazz.Metadata import Metadata
log = logging.getLogger(__name__)
class SourceBase(ABC):
titles: List[str] = []
def __init__(self, titles: List[str]) -> None:
self.titles = titles
@abstractmethod
def search(self) -> Metadata | None:
pass
@abstractmethod
def queryIds(self, title: str) -> dict[str, str]:
pass
def isMatchOrPartial(self, source: str | None, title: str, foundTitle: str) -> bool:
titleParts = re.split(r'[^a-zA-Z0-9\s]', foundTitle)
clean_foundTitle: str | None = titleParts[0].strip() if titleParts else None
directMatch = fuzz.ratio(title, foundTitle)
partialMatch = fuzz.ratio(title, clean_foundTitle) if clean_foundTitle is not None else 0
if directMatch >= 60:
return True
elif partialMatch >= 80:
log.info(f"{source} -> Partial Match for '{title}' of '{foundTitle}' on part '{clean_foundTitle}' with direct score: {directMatch} and partial {partialMatch}")
return True
else:
log.info(f"{source} -> Match failed for '{title}' of '{foundTitle}' on part '{clean_foundTitle}' with direct score: {directMatch} and partial {partialMatch}")
return False
def findBestMatchAcrossTitles(self, idToTitle: dict[str, str], titles: List[str]) -> Tuple[str, str]:
best_match_id = ""
best_match_title = ""
best_ratio = 0
for title in titles:
for id, stored_title in idToTitle.items():
ratio = fuzz.ratio(title, stored_title)
if ratio > best_ratio:
best_ratio = ratio
best_match_id = id
best_match_title = stored_title
return best_match_id, best_match_title
def logNoMatch(self, source: str, titles: List[str]) -> None:
combined_titles = ", ".join(titles)
log.info(f"No match in source {source} for titles: {combined_titles}")