Added missing files
This commit is contained in:
parent
4f0886b3fd
commit
f8c5c03438
82
apps/pyMetadata/DryRun.py
Normal file
82
apps/pyMetadata/DryRun.py
Normal file
@ -0,0 +1,82 @@
|
||||
import logging
|
||||
import signal
|
||||
import sys
|
||||
import os
|
||||
from typing import List, Optional
|
||||
import uuid
|
||||
import threading
|
||||
import json
|
||||
import time
|
||||
from fuzzywuzzy import fuzz
|
||||
|
||||
from algo.AdvancedMatcher import AdvancedMatcher
|
||||
from algo.SimpleMatcher import SimpleMatcher
|
||||
from algo.PrefixMatcher import PrefixMatcher
|
||||
from clazz.KafkaMessageSchema import KafkaMessage, MessageDataWrapper
|
||||
from clazz.Metadata import Metadata
|
||||
|
||||
from sources.anii import Anii
|
||||
from sources.imdb import Imdb
|
||||
from sources.mal import Mal
|
||||
|
||||
|
||||
# Configure root logging once at import time: INFO level, timestamped
# records, everything written to stdout (container/compose friendly).
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        logging.StreamHandler(sys.stdout)
    ]
)
# Module-level logger used throughout this module.
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class DryRun():
    """Run a metadata search for a set of titles and log the resulting
    Kafka message instead of producing it (dry-run mode)."""

    def __init__(self, titles: List[str]) -> None:
        # Title variants to search all sources for. Stored per instance;
        # the original also declared a shared class-level default list.
        self.titles = titles

    def run(self) -> None:
        """Search all sources, wrap the best match in a Kafka message
        payload and log it. Nothing is actually sent to Kafka."""
        combined_titles = ", ".join(self.titles)
        logger.info("Searching for %s", combined_titles)
        result: Metadata | None = self.__getMetadata(self.titles)

        message: str | None = None
        if result is None:
            message = f"No result for {combined_titles}"
            logger.info(message)

        messageData = MessageDataWrapper(
            status = "ERROR" if result is None else "COMPLETED",
            message = message,
            data = result,
            derivedFromEventId = None
        )

        producerMessage = KafkaMessage(referenceId="DryRun..", data=messageData).to_json()
        logger.info(producerMessage)

    def __getMetadata(self, titles: List[str]) -> "Metadata | None":
        """Query every source and return the first match in priority
        order: prefix matcher, then simple, then advanced."""
        mal = Mal(titles=titles)
        anii = Anii(titles=titles)
        imdb = Imdb(titles=titles)

        results: List[Metadata] = [
            mal.search(),
            anii.search(),
            imdb.search()
        ]
        # Drop sources that returned nothing before matching.
        filtered_results = [result for result in results if result is not None]

        logger.info("Simple matcher")
        simpleSelector = SimpleMatcher(titles=titles, metadata=filtered_results).getBestMatch()
        logger.info("Advanced matcher")
        advancedSelector = AdvancedMatcher(titles=titles, metadata=filtered_results).getBestMatch()
        logger.info("Prefix matcher")  # fixed typo: was "Prefrix matcher"
        prefixSelector = PrefixMatcher(titles=titles, metadata=filtered_results).getBestMatch()

        if prefixSelector is not None:
            return prefixSelector
        if simpleSelector is not None:
            return simpleSelector
        if advancedSelector is not None:
            return advancedSelector
        return None
|
||||
31
apps/pyMetadata/algo/AdvancedMatcher.py
Normal file
31
apps/pyMetadata/algo/AdvancedMatcher.py
Normal file
@ -0,0 +1,31 @@
|
||||
from fuzzywuzzy import fuzz
|
||||
from .AlgorithmBase import AlgorithmBase, MatchResult
|
||||
from clazz.Metadata import Metadata
|
||||
|
||||
class AdvancedMatcher(AlgorithmBase):
    """Fuzzy matcher scoring each candidate with token-sort ratios
    against both the main title and every alternate title."""

    def getBestMatch(self) -> Metadata | None:
        """Return the highest-scoring candidate, or None when no
        candidate reaches a combined score of at least 70."""
        top_score = -1
        top_candidate = None
        summary = []

        for query in self.titles:
            query_lower = query.lower()
            for candidate in self.metadata:
                # Score against the primary title.
                primary = fuzz.token_sort_ratio(query_lower, candidate.title.lower())
                # Score against each alternate title; keep the best one.
                alternates = [
                    fuzz.token_sort_ratio(query_lower, alt.lower())
                    for alt in candidate.altTitle
                ]
                best_alt = max(alternates, default=0)

                score = max(primary, best_alt)
                summary.append(MatchResult(query, candidate.title, score, candidate.source, candidate))

                if score > top_score:
                    top_score = score
                    # Only accept a winner above the 70-point threshold.
                    top_candidate = candidate if score >= 70 else None

        self.print_match_summary(summary)

        return top_candidate
|
||||
33
apps/pyMetadata/algo/AlgorithmBase.py
Normal file
33
apps/pyMetadata/algo/AlgorithmBase.py
Normal file
@ -0,0 +1,33 @@
|
||||
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from dataclasses import dataclass
|
||||
from typing import List
|
||||
|
||||
from fuzzywuzzy import fuzz, process
|
||||
from tabulate import tabulate
|
||||
|
||||
from clazz.Metadata import Metadata
|
||||
|
||||
@dataclass
class MatchResult:
    """One scored comparison between a query title and a candidate."""
    title: str          # query title that was matched
    matched_title: str  # candidate (or alternate) title it was compared to
    score: int          # fuzzy match score, 0-100
    source: str         # originating source (e.g. mal / anii / imdb)
    data: Metadata      # full metadata record of the candidate
|
||||
|
||||
|
||||
class AlgorithmBase(ABC):
    """Common base for title-matching strategies: holds the query titles
    and candidate metadata, and renders a tabular match summary."""

    def __init__(self, titles: List[str], metadata: List[Metadata]):
        self.titles = titles
        self.metadata = metadata

    @abstractmethod
    def getBestMatch(self) -> Metadata | None:
        """Return the best-matching metadata record, or None."""
        pass

    def print_match_summary(self, match_results: List[MatchResult]):
        """Print every comparison as a title/match/score/source table."""
        rows = [
            (entry.title, entry.matched_title, entry.score, entry.source)
            for entry in match_results
        ]
        print(tabulate(rows, headers=["Title", "Matched Title", "Score", "Source"]))
|
||||
54
apps/pyMetadata/algo/PrefixMatcher.py
Normal file
54
apps/pyMetadata/algo/PrefixMatcher.py
Normal file
@ -0,0 +1,54 @@
|
||||
import re
|
||||
from typing import List, Optional
|
||||
from fuzzywuzzy import fuzz, process
|
||||
from .AlgorithmBase import AlgorithmBase, MatchResult
|
||||
from clazz.Metadata import Metadata
|
||||
|
||||
|
||||
class PrefixMatcher(AlgorithmBase):
    """Matcher that compares only the first character of the normalized
    titles, ranking by score and then by source priority."""

    def preprocess_text(self, text: str) -> str:
        """Replace every non-alphanumeric character with a space, then
        trim and lower-case the result."""
        unitext = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)
        return unitext.strip().lower()

    def source_priority(self, source: str) -> int:
        """Tie-break rank for sources: mal < anii < imdb < unknown."""
        priority_map = {'mal': 1, 'anii': 2, 'imdb': 3}
        return priority_map.get(source, 4)

    def getBestMatch(self) -> Optional[Metadata]:
        """Score every candidate (main and alternate titles) on the first
        character of its normalized form and return the top-ranked one.

        Returns None only when there were no comparisons at all (no
        titles or no metadata).
        """
        match_results: List[MatchResult] = []

        for title in self.titles:
            # NOTE(review): only the first character survives the slice,
            # so this effectively compares initials - confirm intended.
            prefix = self.preprocess_text(title)[:1]

            for metadata in self.metadata:
                # Main-title comparison.
                meta_prefix = self.preprocess_text(metadata.title)[:1]
                score = fuzz.token_sort_ratio(prefix, meta_prefix)
                match_results.append(MatchResult(title, metadata.title, score, metadata.source, metadata))

                # Alternate-title comparisons.
                for alt_title in metadata.altTitle:
                    alt_prefix = self.preprocess_text(alt_title)[:1]
                    alt_score = fuzz.token_sort_ratio(prefix, alt_prefix)
                    match_results.append(MatchResult(title, alt_title, alt_score, metadata.source, metadata))

        # Highest score first; ties broken by source priority.
        match_results.sort(key=lambda x: (-x.score, self.source_priority(x.source)))

        self.print_match_summary(match_results)

        # BUG FIX (dead code removed): the original also tracked a
        # ">= 70" best_match, but it was never returned - whenever any
        # comparison existed the top sorted result was returned instead,
        # and with no comparisons best_match was always None. Behavior
        # is preserved exactly.
        if match_results:
            return match_results[0].data
        return None
|
||||
33
apps/pyMetadata/algo/SimpleMatcher.py
Normal file
33
apps/pyMetadata/algo/SimpleMatcher.py
Normal file
@ -0,0 +1,33 @@
|
||||
|
||||
from fuzzywuzzy import fuzz, process
|
||||
from .AlgorithmBase import AlgorithmBase, MatchResult
|
||||
from clazz.Metadata import Metadata
|
||||
|
||||
|
||||
class SimpleMatcher(AlgorithmBase):
    """Straightforward fuzzy matcher: token-sort ratio over the full
    lower-cased main and alternate titles, acceptance threshold 70."""

    def getBestMatch(self) -> Metadata | None:
        """Return the highest-scoring candidate, or None when nothing
        reaches a score of 70."""
        winner = None
        high = -1
        comparisons = []

        for query in self.titles:
            q = query.lower()
            for candidate in self.metadata:
                # Main title comparison.
                main_score = fuzz.token_sort_ratio(q, candidate.title.lower())
                comparisons.append(MatchResult(query, candidate.title, main_score, candidate.source, candidate))
                if main_score > high:
                    high = main_score
                    winner = candidate if main_score >= 70 else None

                # Alternate title comparisons.
                for alt in candidate.altTitle:
                    alt_score = fuzz.token_sort_ratio(q, alt.lower())
                    comparisons.append(MatchResult(query, alt, alt_score, candidate.source, candidate))
                    if alt_score > high:
                        high = alt_score
                        winner = candidate if alt_score >= 70 else None

        self.print_match_summary(comparisons)

        return winner
|
||||
103
apps/pyMetadata/algo/SourceWeighted.py
Normal file
103
apps/pyMetadata/algo/SourceWeighted.py
Normal file
@ -0,0 +1,103 @@
|
||||
import logging
import re
from dataclasses import dataclass
from typing import List, Optional

import pandas as pd
from fuzzywuzzy import fuzz
from unidecode import unidecode

from clazz.Metadata import Metadata
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
@dataclass
class WeightedData:
    """A single source's result together with its source-trust weight."""
    result: Metadata  # metadata returned by one source
    weight: float     # multiplier applied to that source's fuzzy score
|
||||
|
||||
@dataclass
class DataAndScore:
    """A scored (and weighted) comparison of one result to one title."""
    result: Metadata    # candidate metadata record
    score: float        # fuzzy score multiplied by the source weight
    weight: float       # weight that was applied
    matched_title: str  # query title this score was computed for
|
||||
|
||||
|
||||
class UseSource:
|
||||
titles: List[str] = []
|
||||
dataWeighed: List[WeightedData] = []
|
||||
|
||||
def __init__(self, titles: List[str], mal: Optional[Metadata] = None, imdb: Optional[Metadata] = None, anii: Optional[Metadata] = None) -> None:
|
||||
self.titles = titles
|
||||
if mal is not None:
|
||||
self.dataWeighed.append(WeightedData(mal, 1.5))
|
||||
|
||||
if imdb is not None:
|
||||
self.dataWeighed.append(WeightedData(imdb, 1))
|
||||
|
||||
if anii is not None:
|
||||
self.dataWeighed.append(WeightedData(anii, 1.3))
|
||||
|
||||
|
||||
def stripped(self, input_string) -> str:
|
||||
unitext = unidecode(input_string)
|
||||
unitext = re.sub(r'[^a-zA-Z0-9\s]', ' ', unitext)
|
||||
unitext = re.sub(r'\s{2,}', ' ', unitext)
|
||||
return unitext.strip()
|
||||
|
||||
|
||||
def __calculate_score(self, title: str, weightData: List[WeightedData]) -> List[DataAndScore]:
|
||||
result: List[DataAndScore] = []
|
||||
|
||||
for title_to_check in self.titles:
|
||||
for wd in weightData:
|
||||
if wd.result is None:
|
||||
continue
|
||||
|
||||
highScore = fuzz.ratio(self.stripped(title_to_check.lower()), self.stripped(wd.result.title.lower()))
|
||||
for alt_title in wd.result.altTitle:
|
||||
altScore = fuzz.ratio(self.stripped(title_to_check.lower()), self.stripped(alt_title.lower()))
|
||||
if altScore > highScore:
|
||||
highScore = altScore
|
||||
givenScore = highScore * wd.weight
|
||||
result.append(DataAndScore(wd.result, givenScore, wd.weight, title_to_check))
|
||||
|
||||
result.sort(key=lambda x: x.score, reverse=True)
|
||||
return result
|
||||
|
||||
def select_result_table(self) -> Optional[pd.DataFrame]:
|
||||
scoredResults = []
|
||||
for title in self.titles:
|
||||
scoredResult = self.__calculate_score(title=title, weightData=self.dataWeighed)
|
||||
scoredResults.append(scoredResult)
|
||||
|
||||
all_results = [item for sublist in scoredResults for item in sublist]
|
||||
|
||||
if not all_results:
|
||||
return None
|
||||
|
||||
# Prepare data for DataFrame
|
||||
data = {
|
||||
"Title": [],
|
||||
"Alt Title": [],
|
||||
"Score": [],
|
||||
"Weight": [],
|
||||
"Matched Title": []
|
||||
}
|
||||
|
||||
for ds in all_results:
|
||||
metadata = ds.result
|
||||
data["Title"].append(metadata.title)
|
||||
data["Alt Title"].append(", ".join(metadata.altTitle))
|
||||
data["Score"].append(ds.score)
|
||||
data["Weight"].append(ds.weight)
|
||||
data["Matched Title"].append(ds.matched_title)
|
||||
|
||||
df = pd.DataFrame(data)
|
||||
df = df.sort_values(by="Score", ascending=False).reset_index(drop=True)
|
||||
|
||||
try:
|
||||
df.to_json(f"./logs/{self.titles[0]}.json", orient="records", indent=4)
|
||||
except Exception as e:
|
||||
log.error(f"Failed to dump JSON: {e}")
|
||||
|
||||
return df
|
||||
0
apps/pyMetadata/algo/__init__.py
Normal file
0
apps/pyMetadata/algo/__init__.py
Normal file
@ -9,9 +9,9 @@ import json
|
||||
import time
|
||||
from fuzzywuzzy import fuzz
|
||||
|
||||
from algo.AdvancedMatcher import AdvancedMatcher
|
||||
from algo.SimpleMatcher import SimpleMatcher
|
||||
from algo.PrefixMatcher import PrefixMatcher
|
||||
from .algo.AdvancedMatcher import AdvancedMatcher
|
||||
from .algo.SimpleMatcher import SimpleMatcher
|
||||
from .algo.PrefixMatcher import PrefixMatcher
|
||||
from clazz.shared import ConsumerRecord, MediaEvent, decode_key, decode_value, suppress_ignore, consume_on_key
|
||||
from clazz.KafkaMessageSchema import KafkaMessage, MessageDataWrapper
|
||||
from clazz.Metadata import Metadata
|
||||
|
||||
38
apps/pyMetadata/clazz/KafkaMessageSchema.py
Normal file
38
apps/pyMetadata/clazz/KafkaMessageSchema.py
Normal file
@ -0,0 +1,38 @@
|
||||
|
||||
|
||||
from dataclasses import asdict, dataclass
|
||||
import uuid, json
|
||||
|
||||
from .Metadata import Metadata
|
||||
|
||||
|
||||
@dataclass
class MessageDataWrapper:
    """Payload wrapper carried inside a KafkaMessage."""
    status: str # COMPLETED / ERROR
    message: str | None             # human-readable error/info text
    data: Metadata | None           # search result, when successful
    derivedFromEventId: str | None  # id of the triggering event, if any

    def to_dict(self):
        """Serialize to a plain dict (dataclasses.asdict, recursive)."""
        return asdict(self)
|
||||
|
||||
|
||||
class KafkaMessage:
    """Envelope for a message produced to Kafka: a reference id, a
    unique event id and the wrapped payload."""
    referenceId: str
    eventId: str
    data: "MessageDataWrapper"

    def __init__(self, referenceId: str, data: "MessageDataWrapper") -> None:
        self.referenceId = referenceId
        # BUG FIX: eventId used to be a class attribute initialized once
        # at class-definition time, so every message shared one id.
        # Generate a fresh UUID per message instead.
        self.eventId = str(uuid.uuid4())
        self.data = data

    def to_json(self):
        """Serialize the envelope (and nested payload) to a JSON string."""
        payload = {
            'referenceId': self.referenceId,
            'eventId': self.eventId,
            'data': self.data.to_dict() if self.data else None
        }
        return json.dumps(payload)
|
||||
|
||||
|
||||
26
apps/pyMetadata/clazz/Metadata.py
Normal file
26
apps/pyMetadata/clazz/Metadata.py
Normal file
@ -0,0 +1,26 @@
|
||||
from dataclasses import asdict, dataclass
|
||||
from typing import List, Optional
|
||||
|
||||
@dataclass
class Summary:
    """A plot summary in one specific language."""
    summary: str   # summary text
    language: str  # language of the text

    def to_dict(self):
        """Serialize to a plain dict (dataclasses.asdict)."""
        return asdict(self)
|
||||
|
||||
|
||||
@dataclass
class Metadata:
    """Normalized metadata record returned by every source."""
    title: str             # primary title
    altTitle: List[str]    # alternate titles / translations
    cover: str             # cover image URL
    banner: Optional[str]  # banner image URL, when available
    type: str # Serie/Movie
    summary: List[Summary] # plot summaries per language
    genres: List[str]      # genre names
    source: str            # originating source identifier (e.g. mal)

    def to_dict(self):
        """Serialize to a plain dict (dataclasses.asdict, recursive)."""
        return asdict(self)
|
||||
|
||||
0
apps/pyMetadata/clazz/__init__.py
Normal file
0
apps/pyMetadata/clazz/__init__.py
Normal file
74
apps/pyMetadata/clazz/shared.py
Normal file
74
apps/pyMetadata/clazz/shared.py
Normal file
@ -0,0 +1,74 @@
|
||||
|
||||
from typing import Any, List
|
||||
import json
|
||||
|
||||
|
||||
# Event keys that are known but irrelevant to this service; consumers
# use this list to skip them without logging noise.
suppress_ignore: List[str] = [
    "event:media-process:started",
    "event:request-process:started",
    "event::save",
    "event:media-process:completed",
    "event:work-encode:created",
    "event:work-extract:created",
    "event:work-convert:created",
    "event:work-encode:performed",
    "event:work-extract:performed",
    "event:work-convert:performed",
    "event:media-read-out-cover:performed",
    "event:work-download-cover:performed",
    "event:media-read-out-name-and-type:performed",
    "event:media-parse-stream:performed",
    "event:media-extract-parameter:created",
    "event:media-encode-parameter:created",
    "event:media-metadata-search:performed"
]

# Event keys this service actually consumes and acts on.
consume_on_key: List[str] = [
    "request:metadata:obtain",
    "event:media-read-base-info:performed"
]
|
||||
|
||||
def decode_key(key_bytes: bytes | None):
|
||||
return key_bytes.decode('utf-8') if key_bytes else None
|
||||
|
||||
def decode_value(value_bytes: bytes | None):
|
||||
return json.loads(value_bytes.decode('utf-8')) if value_bytes else None
|
||||
|
||||
|
||||
|
||||
class ConsumerRecord:
    """Thin snapshot of a raw Kafka consumer message.

    NOTE(review): `value` is annotated `str | None`, but MediaEvent
    indexes it like a dict - presumably it holds the already-deserialized
    JSON payload; confirm against the consumer's deserializer setup.
    """
    topic: str         # topic the message was read from
    partition: int     # partition number
    offset: int        # message offset within the partition
    key: str           # message key (already decoded)
    value: str | None  # message value (see NOTE above)
    timestamp: int     # broker-assigned timestamp

    def __init__(self, message: Any) -> None:
        # Copies fields off the raw client message. If message is None,
        # the attributes are never set and access raises AttributeError.
        if message is not None:
            self.key = message.key
            self.value = message.value
            self.topic = message.topic
            self.offset = message.offset
            self.partition = message.partition
            self.timestamp = message.timestamp
|
||||
|
||||
|
||||
class MediaEvent():
|
||||
__consumerRecord: ConsumerRecord
|
||||
referenceId: str
|
||||
eventId: str
|
||||
data: dict | None
|
||||
|
||||
def __init__(self, message: ConsumerRecord) -> None:
|
||||
self.__consumerRecord = message
|
||||
self.referenceId = message.value["referenceId"]
|
||||
self.eventId = message.value["eventId"]
|
||||
self.data = message.value["data"] if "data" in message.value else None
|
||||
|
||||
def isConsumable(self) -> bool:
|
||||
if "status" in self.data:
|
||||
if self.data["status"] == "COMPLETED":
|
||||
return True
|
||||
return False
|
||||
|
||||
60
apps/pyMetadata/sources/source.py
Normal file
60
apps/pyMetadata/sources/source.py
Normal file
@ -0,0 +1,60 @@
|
||||
|
||||
import logging, re
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import List, Tuple
|
||||
|
||||
from fuzzywuzzy import fuzz
|
||||
|
||||
from clazz.Metadata import Metadata
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
class SourceBase(ABC):
    """Base class for metadata sources (mal/anii/imdb): holds the query
    titles and provides shared fuzzy-matching helpers."""

    def __init__(self, titles: List[str]) -> None:
        # Titles to query this source for. Instance-level; the original
        # also declared a shared class-level default list (removed).
        self.titles = titles

    @abstractmethod
    def search(self) -> "Metadata | None":
        """Query the source and return its best metadata, or None.
        (Stray comma in the original signature removed.)"""
        pass

    @abstractmethod
    def queryIds(self, title: str) -> dict[str, str]:
        """Return a mapping of source-specific id -> title for *title*."""
        pass

    def isMatchOrPartial(self, source: str | None, title, foundTitle) -> bool:
        """True when *foundTitle* matches *title* directly (ratio >= 60)
        or its first punctuation-delimited part matches well (>= 80)."""
        titleParts = re.split(r'[^a-zA-Z0-9\s]', foundTitle)
        clean_foundTitle: str | None = titleParts[0].strip() if titleParts else None
        directMatch = fuzz.ratio(title, foundTitle)
        partialMatch = fuzz.ratio(title, clean_foundTitle) if clean_foundTitle is not None else 0

        if directMatch >= 60:
            return True
        elif partialMatch >= 80:
            log.info(f"{source} -> Partial Match for '{title}' of '{foundTitle}' on part '{clean_foundTitle}' with direct score: {directMatch} and partial {partialMatch}")
            return True
        else:
            log.info(f"{source} -> Match failed for '{title}' of '{foundTitle}' on part '{clean_foundTitle}' with direct score: {directMatch} and partial {partialMatch}")
            return False

    def findBestMatchAcrossTitles(self, idToTitle: dict[str, str], titles: List[str]) -> Tuple[str, str]:
        """Return (id, title) of the stored title with the highest fuzzy
        ratio against any query title; ("", "") when nothing beats 0."""
        best_match_id = ""
        best_match_title = ""
        best_ratio = 0

        for title in titles:
            for id, stored_title in idToTitle.items():
                ratio = fuzz.ratio(title, stored_title)
                if ratio > best_ratio:
                    best_ratio = ratio
                    best_match_id = id
                    best_match_title = stored_title

        return best_match_id, best_match_title

    def logNoMatch(self, source: str, titles: List[str]) -> None:
        """Log that *source* produced no match for the given titles."""
        combined_titles = ", ".join(titles)
        log.info(f"No match in source {source} for titles: {combined_titles}")
|
||||
Loading…
Reference in New Issue
Block a user