Added missing files
This commit is contained in:
parent
4f0886b3fd
commit
f8c5c03438
82
apps/pyMetadata/DryRun.py
Normal file
82
apps/pyMetadata/DryRun.py
Normal file
@ -0,0 +1,82 @@
|
|||||||
|
import logging
|
||||||
|
import signal
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
from typing import List, Optional
|
||||||
|
import uuid
|
||||||
|
import threading
|
||||||
|
import json
|
||||||
|
import time
|
||||||
|
from fuzzywuzzy import fuzz
|
||||||
|
|
||||||
|
from algo.AdvancedMatcher import AdvancedMatcher
|
||||||
|
from algo.SimpleMatcher import SimpleMatcher
|
||||||
|
from algo.PrefixMatcher import PrefixMatcher
|
||||||
|
from clazz.KafkaMessageSchema import KafkaMessage, MessageDataWrapper
|
||||||
|
from clazz.Metadata import Metadata
|
||||||
|
|
||||||
|
from sources.anii import Anii
|
||||||
|
from sources.imdb import Imdb
|
||||||
|
from sources.mal import Mal
|
||||||
|
|
||||||
|
|
||||||
|
logging.basicConfig(
|
||||||
|
level=logging.INFO,
|
||||||
|
format="%(asctime)s [%(levelname)s] %(message)s",
|
||||||
|
handlers=[
|
||||||
|
logging.StreamHandler(sys.stdout)
|
||||||
|
]
|
||||||
|
)
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class DryRun():
    """Runs the metadata-search pipeline once for a list of titles and logs
    the Kafka message that WOULD have been produced, without touching Kafka.
    """

    # Titles to search for; set per instance in __init__.
    # (Was a mutable class-level default ``= []`` — removed so instances
    # never share state.)
    titles: List[str]

    def __init__(self, titles: List[str]) -> None:
        self.titles = titles

    def run(self) -> None:
        """Search all sources, select the best metadata and log the
        resulting (unsent) Kafka message payload."""
        combined_titles = ", ".join(self.titles)
        logger.info("Searching for %s", combined_titles)
        result: Metadata | None = self.__getMetadata(self.titles)

        message: str | None = None
        if result is None:
            message = f"No result for {combined_titles}"
            logger.info(message)

        messageData = MessageDataWrapper(
            status = "ERROR" if result is None else "COMPLETED",
            message = message,
            data = result,
            derivedFromEventId = None
        )

        producerMessage = KafkaMessage(referenceId="DryRun..", data=messageData).to_json()
        logger.info(producerMessage)

    def __getMetadata(self, titles: List[str]) -> Metadata | None:
        """Query all sources and return the first acceptable match in
        priority order: prefix, simple, advanced. None when nothing matched.
        """
        sources = (Mal(titles=titles), Anii(titles=titles), Imdb(titles=titles))
        # search() may return None; drop misses before matching.
        # (Original annotated the raw list as List[Metadata] although it
        # could contain None — filter first, then the annotation is true.)
        filtered_results: List[Metadata] = [
            found for found in (source.search() for source in sources)
            if found is not None
        ]

        logger.info("Simple matcher")
        simpleSelector = SimpleMatcher(titles=titles, metadata=filtered_results).getBestMatch()

        logger.info("Advanced matcher")
        advancedSelector = AdvancedMatcher(titles=titles, metadata=filtered_results).getBestMatch()

        logger.info("Prefix matcher")  # typo fix: was "Prefrix matcher"
        prefixSelector = PrefixMatcher(titles=titles, metadata=filtered_results).getBestMatch()

        if prefixSelector is not None:
            return prefixSelector
        if simpleSelector is not None:
            return simpleSelector
        return advancedSelector
|
||||||
31
apps/pyMetadata/algo/AdvancedMatcher.py
Normal file
31
apps/pyMetadata/algo/AdvancedMatcher.py
Normal file
@ -0,0 +1,31 @@
|
|||||||
|
from fuzzywuzzy import fuzz
|
||||||
|
from .AlgorithmBase import AlgorithmBase, MatchResult
|
||||||
|
from clazz.Metadata import Metadata
|
||||||
|
|
||||||
|
class AdvancedMatcher(AlgorithmBase):
    """Fuzzy matcher that scores each candidate with token_sort_ratio
    against the main title and every alternative title, keeping the
    highest of those ratios per candidate."""

    def getBestMatch(self) -> Metadata | None:
        """Return the candidate with the highest combined score, or None
        when no candidate reaches 70."""
        scored = []

        for wanted in self.titles:
            for entry in self.metadata:
                # Best of: main-title ratio and every alt-title ratio.
                primary_ratio = fuzz.token_sort_ratio(wanted.lower(), entry.title.lower())
                alt_ratios = [
                    fuzz.token_sort_ratio(wanted.lower(), alternative.lower())
                    for alternative in entry.altTitle
                ]
                combined = max([primary_ratio, *alt_ratios])
                scored.append(MatchResult(wanted, entry.title, combined, entry.source, entry))

        # Log the comparison table before deciding.
        self.print_match_summary(scored)

        if not scored:
            return None
        # max() keeps the first of equal scores, matching the original
        # strictly-greater update rule.
        winner = max(scored, key=lambda outcome: outcome.score)
        return winner.data if winner.score >= 70 else None
|
||||||
33
apps/pyMetadata/algo/AlgorithmBase.py
Normal file
33
apps/pyMetadata/algo/AlgorithmBase.py
Normal file
@ -0,0 +1,33 @@
|
|||||||
|
|
||||||
|
|
||||||
|
from abc import ABC, abstractmethod
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
from fuzzywuzzy import fuzz, process
|
||||||
|
from tabulate import tabulate
|
||||||
|
|
||||||
|
from clazz.Metadata import Metadata
|
||||||
|
|
||||||
|
@dataclass
class MatchResult:
    """One scored comparison between a searched title and a candidate."""

    title: str  # the title we searched for
    matched_title: str  # candidate title (main or alternative) it was compared against
    score: int  # fuzzy-match score, 0-100
    source: str  # which source produced the candidate (e.g. mal/anii/imdb)
    data: Metadata  # full metadata record the candidate belongs to
|
||||||
|
|
||||||
|
|
||||||
|
class AlgorithmBase(ABC):
    """Common base for title-matching strategies: holds the inputs and
    provides a tabular summary printer."""

    def __init__(self, titles: List[str], metadata: List[Metadata]):
        # Candidate titles supplied by the caller and the metadata
        # records to match them against.
        self.titles = titles
        self.metadata = metadata

    @abstractmethod
    def getBestMatch(self) -> Metadata | None:
        """Return the best-matching metadata record, or None."""
        pass

    def print_match_summary(self, match_results: List[MatchResult]):
        """Print every comparison as a table (title, candidate, score, source)."""
        column_names = ["Title", "Matched Title", "Score", "Source"]
        rows = []
        for outcome in match_results:
            rows.append((outcome.title, outcome.matched_title, outcome.score, outcome.source))
        print(tabulate(rows, headers=column_names))
|
||||||
54
apps/pyMetadata/algo/PrefixMatcher.py
Normal file
54
apps/pyMetadata/algo/PrefixMatcher.py
Normal file
@ -0,0 +1,54 @@
|
|||||||
|
import re
|
||||||
|
from typing import List, Optional
|
||||||
|
from fuzzywuzzy import fuzz, process
|
||||||
|
from .AlgorithmBase import AlgorithmBase, MatchResult
|
||||||
|
from clazz.Metadata import Metadata
|
||||||
|
|
||||||
|
|
||||||
|
class PrefixMatcher(AlgorithmBase):
    """Matches titles by comparing a normalized prefix of each title,
    breaking score ties by source priority (mal > anii > imdb).

    NOTE(review): preprocessed strings are truncated with ``[:1]``, so only
    the FIRST CHARACTER is compared (scores are effectively 0 or 100) —
    confirm the intended prefix length.
    """

    def preprocess_text(self, text: str) -> str:
        """Replace non-alphanumerics with spaces, then trim and lowercase."""
        unitext = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)
        return unitext.strip().lower()

    def source_priority(self, source: str) -> int:
        """Tie-break priority per source; lower is better, unknown sources last."""
        priority_map = {'mal': 1, 'anii': 2, 'imdb': 3}
        return priority_map.get(source, 4)

    def getBestMatch(self) -> Optional[Metadata]:
        """Return the best-scoring candidate, or None when nothing reaches
        the acceptance threshold.

        BUG FIX: the original returned the top-sorted result whenever ANY
        results existed, bypassing the >= 70 threshold (its ``best_match``
        bookkeeping was dead code) — so this matcher "matched" even with a
        score of 0, starving the other matchers. The threshold is now
        applied to the top result, consistent with SimpleMatcher and
        AdvancedMatcher, while keeping the source-priority tie-break.
        """
        match_results: List[MatchResult] = []

        for title in self.titles:
            preprocessed_title = self.preprocess_text(title)[:1]

            for metadata in self.metadata:
                # Match against the main metadata title.
                preprocessed_metadata_title = self.preprocess_text(metadata.title)[:1]
                score = fuzz.token_sort_ratio(preprocessed_title, preprocessed_metadata_title)
                match_results.append(MatchResult(title, metadata.title, score, metadata.source, metadata))

                # Match against each alternative title.
                for alt_title in metadata.altTitle:
                    preprocessed_alt_title = self.preprocess_text(alt_title)[:1]
                    alt_score = fuzz.token_sort_ratio(preprocessed_title, preprocessed_alt_title)
                    match_results.append(MatchResult(title, alt_title, alt_score, metadata.source, metadata))

        # Best score first; equal scores ordered by source priority.
        match_results.sort(key=lambda x: (-x.score, self.source_priority(x.source)))

        # Log the comparison table before deciding.
        self.print_match_summary(match_results)

        if match_results and match_results[0].score >= 70:
            return match_results[0].data
        return None
|
||||||
33
apps/pyMetadata/algo/SimpleMatcher.py
Normal file
33
apps/pyMetadata/algo/SimpleMatcher.py
Normal file
@ -0,0 +1,33 @@
|
|||||||
|
|
||||||
|
from fuzzywuzzy import fuzz, process
|
||||||
|
from .AlgorithmBase import AlgorithmBase, MatchResult
|
||||||
|
from clazz.Metadata import Metadata
|
||||||
|
|
||||||
|
|
||||||
|
class SimpleMatcher(AlgorithmBase):
    """Straightforward fuzzy matcher: token_sort_ratio of each searched
    title against every candidate's main and alternative titles."""

    def getBestMatch(self) -> Metadata | None:
        """Return the highest-scoring candidate, or None if no score
        reaches 70."""
        outcomes = []

        for wanted in self.titles:
            for entry in self.metadata:
                # Main title first, then the alternatives — same order as
                # the comparison table expects.
                for candidate in [entry.title, *entry.altTitle]:
                    ratio = fuzz.token_sort_ratio(wanted.lower(), candidate.lower())
                    outcomes.append(MatchResult(wanted, candidate, ratio, entry.source, entry))

        # Log the comparison table before deciding.
        self.print_match_summary(outcomes)

        # max() keeps the first of equal scores, matching the original
        # strictly-greater update rule.
        winner = max(outcomes, key=lambda outcome: outcome.score, default=None)
        if winner is None or winner.score < 70:
            return None
        return winner.data
|
||||||
103
apps/pyMetadata/algo/SourceWeighted.py
Normal file
103
apps/pyMetadata/algo/SourceWeighted.py
Normal file
@ -0,0 +1,103 @@
|
|||||||
|
from dataclasses import dataclass
|
||||||
|
from typing import List, Optional
|
||||||
|
from fuzzywuzzy import fuzz
|
||||||
|
from unidecode import unidecode
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
|
|
||||||
|
from clazz.Metadata import Metadata
|
||||||
|
|
||||||
|
log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
@dataclass
class WeightedData:
    """A source's metadata result paired with that source's trust weight."""

    result: Metadata  # the search result from one source
    weight: float  # multiplier applied to fuzzy-match scores from this source
|
||||||
|
|
||||||
|
@dataclass
class DataAndScore:
    """A scored, weighted candidate produced while ranking results."""

    result: Metadata  # the candidate metadata record
    score: float  # best fuzzy ratio multiplied by the source weight
    weight: float  # the source weight that was applied
    matched_title: str  # the searched title this score was computed for
|
||||||
|
|
||||||
|
|
||||||
|
class UseSource:
    """Ranks metadata results from multiple sources using source-specific
    weights (mal 1.5, anii 1.3, imdb 1.0) and fuzzy title matching."""

    def __init__(self, titles: List[str], mal: Optional[Metadata] = None, imdb: Optional[Metadata] = None, anii: Optional[Metadata] = None) -> None:
        self.titles = titles
        # BUG FIX: ``titles`` and ``dataWeighed`` were class-level mutable
        # lists — every instance appended into the SAME list, so results
        # accumulated across instances. They are now per-instance.
        self.dataWeighed: List[WeightedData] = []
        if mal is not None:
            self.dataWeighed.append(WeightedData(mal, 1.5))

        if imdb is not None:
            self.dataWeighed.append(WeightedData(imdb, 1))

        if anii is not None:
            self.dataWeighed.append(WeightedData(anii, 1.3))

    def stripped(self, input_string) -> str:
        """Normalize a title: transliterate to ASCII, replace
        non-alphanumerics with spaces, collapse runs of whitespace, trim."""
        unitext = unidecode(input_string)
        unitext = re.sub(r'[^a-zA-Z0-9\s]', ' ', unitext)
        unitext = re.sub(r'\s{2,}', ' ', unitext)
        return unitext.strip()

    def __calculate_score(self, title: str, weightData: List[WeightedData]) -> List[DataAndScore]:
        """Score every weighted candidate against *title* and return them
        sorted by weighted score, descending.

        BUG FIX: the original ignored its ``title`` parameter and looped
        over ``self.titles`` instead, so callers iterating the titles got
        every pair scored len(titles) times (duplicated rows).
        """
        result: List[DataAndScore] = []

        for wd in weightData:
            if wd.result is None:
                continue

            # Best ratio across the main title and all alternatives.
            highScore = fuzz.ratio(self.stripped(title.lower()), self.stripped(wd.result.title.lower()))
            for alt_title in wd.result.altTitle:
                altScore = fuzz.ratio(self.stripped(title.lower()), self.stripped(alt_title.lower()))
                if altScore > highScore:
                    highScore = altScore

            givenScore = highScore * wd.weight
            result.append(DataAndScore(wd.result, givenScore, wd.weight, title))

        result.sort(key=lambda x: x.score, reverse=True)
        return result

    def select_result_table(self) -> Optional["pd.DataFrame"]:
        """Build a DataFrame of all scored candidates sorted by score
        (best first), dump it to ./logs/<first title>.json (best effort),
        and return it. Returns None when there is nothing to rank."""
        # BUG FIX: the module never imported pandas, so ``pd`` was an
        # unresolved name (the return annotation alone crashed the module
        # at import time). Imported locally; annotation quoted.
        import pandas as pd

        scoredResults = []
        for title in self.titles:
            scoredResult = self.__calculate_score(title=title, weightData=self.dataWeighed)
            scoredResults.append(scoredResult)

        all_results = [item for sublist in scoredResults for item in sublist]

        if not all_results:
            return None

        # Prepare data for DataFrame
        data = {
            "Title": [],
            "Alt Title": [],
            "Score": [],
            "Weight": [],
            "Matched Title": []
        }

        for ds in all_results:
            metadata = ds.result
            data["Title"].append(metadata.title)
            data["Alt Title"].append(", ".join(metadata.altTitle))
            data["Score"].append(ds.score)
            data["Weight"].append(ds.weight)
            data["Matched Title"].append(ds.matched_title)

        df = pd.DataFrame(data)
        df = df.sort_values(by="Score", ascending=False).reset_index(drop=True)

        # Best-effort debug dump; failure must not break result selection.
        try:
            df.to_json(f"./logs/{self.titles[0]}.json", orient="records", indent=4)
        except Exception as e:
            log.error(f"Failed to dump JSON: {e}")

        return df
|
||||||
0
apps/pyMetadata/algo/__init__.py
Normal file
0
apps/pyMetadata/algo/__init__.py
Normal file
@ -9,9 +9,9 @@ import json
|
|||||||
import time
|
import time
|
||||||
from fuzzywuzzy import fuzz
|
from fuzzywuzzy import fuzz
|
||||||
|
|
||||||
from algo.AdvancedMatcher import AdvancedMatcher
|
from .algo.AdvancedMatcher import AdvancedMatcher
|
||||||
from algo.SimpleMatcher import SimpleMatcher
|
from .algo.SimpleMatcher import SimpleMatcher
|
||||||
from algo.PrefixMatcher import PrefixMatcher
|
from .algo.PrefixMatcher import PrefixMatcher
|
||||||
from clazz.shared import ConsumerRecord, MediaEvent, decode_key, decode_value, suppress_ignore, consume_on_key
|
from clazz.shared import ConsumerRecord, MediaEvent, decode_key, decode_value, suppress_ignore, consume_on_key
|
||||||
from clazz.KafkaMessageSchema import KafkaMessage, MessageDataWrapper
|
from clazz.KafkaMessageSchema import KafkaMessage, MessageDataWrapper
|
||||||
from clazz.Metadata import Metadata
|
from clazz.Metadata import Metadata
|
||||||
|
|||||||
38
apps/pyMetadata/clazz/KafkaMessageSchema.py
Normal file
38
apps/pyMetadata/clazz/KafkaMessageSchema.py
Normal file
@ -0,0 +1,38 @@
|
|||||||
|
|
||||||
|
|
||||||
|
from dataclasses import asdict, dataclass
|
||||||
|
import uuid, json
|
||||||
|
|
||||||
|
from .Metadata import Metadata
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class MessageDataWrapper:
    """Payload envelope for an outgoing Kafka message."""

    status: str # COMPLETED / ERROR
    message: str | None  # human-readable detail, set when status is ERROR
    data: Metadata | None  # search result; None when nothing was found
    derivedFromEventId: str | None  # id of the event that triggered this message, if any

    def to_dict(self):
        """Return the dataclass as a plain dict (for JSON serialization)."""
        return asdict(self)
|
||||||
|
|
||||||
|
|
||||||
|
class KafkaMessage:
    """An outgoing Kafka message: a reference id, a unique event id and a
    wrapped payload."""

    referenceId: str
    eventId: str
    data: "MessageDataWrapper"

    def __init__(self, referenceId: str, data: "MessageDataWrapper") -> None:
        self.referenceId = referenceId
        # BUG FIX: eventId was a class attribute initialized with
        # ``str(uuid.uuid4())`` at class-definition time, so EVERY message
        # shared the same event id. It is now generated per instance.
        self.eventId = str(uuid.uuid4())
        self.data = data

    def to_json(self):
        """Serialize the message to a JSON string; ``data`` becomes None
        when no payload is attached."""
        payload = {
            'referenceId': self.referenceId,
            'eventId': self.eventId,
            'data': self.data.to_dict() if self.data else None
        }
        return json.dumps(payload)
|
||||||
|
|
||||||
|
|
||||||
26
apps/pyMetadata/clazz/Metadata.py
Normal file
26
apps/pyMetadata/clazz/Metadata.py
Normal file
@ -0,0 +1,26 @@
|
|||||||
|
from dataclasses import asdict, dataclass
|
||||||
|
from typing import List, Optional
|
||||||
|
|
||||||
|
@dataclass
class Summary:
    """A plot summary in one language."""

    summary: str  # the summary text
    language: str  # language of the text

    def to_dict(self):
        """Return the dataclass as a plain dict (for JSON serialization)."""
        return asdict(self)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class Metadata:
    """Normalized media metadata as returned by any source."""

    title: str  # primary title
    altTitle: List[str]  # alternative titles (other languages, romanizations)
    cover: str  # cover image URL
    banner: Optional[str]  # banner image URL, if the source provides one
    type: str # Serie/Movie
    summary: List[Summary]  # plot summaries, possibly in several languages
    genres: List[str]  # genre names
    source: str  # which source produced this record (e.g. mal/anii/imdb)

    def to_dict(self):
        """Return the dataclass as a plain dict (for JSON serialization)."""
        return asdict(self)
|
||||||
|
|
||||||
0
apps/pyMetadata/clazz/__init__.py
Normal file
0
apps/pyMetadata/clazz/__init__.py
Normal file
74
apps/pyMetadata/clazz/shared.py
Normal file
74
apps/pyMetadata/clazz/shared.py
Normal file
@ -0,0 +1,74 @@
|
|||||||
|
|
||||||
|
from typing import Any, List
|
||||||
|
import json
|
||||||
|
|
||||||
|
|
||||||
|
# Message keys this consumer deliberately ignores (they belong to other
# stages of the media pipeline).
suppress_ignore: List[str] = [
    "event:media-process:started",
    "event:request-process:started",
    "event::save",
    "event:media-process:completed",
    "event:work-encode:created",
    "event:work-extract:created",
    "event:work-convert:created",
    "event:work-encode:performed",
    "event:work-extract:performed",
    "event:work-convert:performed",
    "event:media-read-out-cover:performed",
    "event:work-download-cover:performed",
    "event:media-read-out-name-and-type:performed",
    "event:media-parse-stream:performed",
    "event:media-extract-parameter:created",
    "event:media-encode-parameter:created",
    "event:media-metadata-search:performed"
]

# Message keys this service reacts to.
consume_on_key: List[str] = [
    "request:metadata:obtain",
    "event:media-read-base-info:performed"
]
|
||||||
|
|
||||||
|
def decode_key(key_bytes: bytes | None):
|
||||||
|
return key_bytes.decode('utf-8') if key_bytes else None
|
||||||
|
|
||||||
|
def decode_value(value_bytes: bytes | None):
|
||||||
|
return json.loads(value_bytes.decode('utf-8')) if value_bytes else None
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
class ConsumerRecord:
|
||||||
|
topic: str
|
||||||
|
partition: int
|
||||||
|
offset: int
|
||||||
|
key: str
|
||||||
|
value: str | None
|
||||||
|
timestamp: int
|
||||||
|
|
||||||
|
def __init__(self, message: Any) -> None:
|
||||||
|
if message is not None:
|
||||||
|
self.key = message.key
|
||||||
|
self.value = message.value
|
||||||
|
self.topic = message.topic
|
||||||
|
self.offset = message.offset
|
||||||
|
self.partition = message.partition
|
||||||
|
self.timestamp = message.timestamp
|
||||||
|
|
||||||
|
|
||||||
|
class MediaEvent():
|
||||||
|
__consumerRecord: ConsumerRecord
|
||||||
|
referenceId: str
|
||||||
|
eventId: str
|
||||||
|
data: dict | None
|
||||||
|
|
||||||
|
def __init__(self, message: ConsumerRecord) -> None:
|
||||||
|
self.__consumerRecord = message
|
||||||
|
self.referenceId = message.value["referenceId"]
|
||||||
|
self.eventId = message.value["eventId"]
|
||||||
|
self.data = message.value["data"] if "data" in message.value else None
|
||||||
|
|
||||||
|
def isConsumable(self) -> bool:
|
||||||
|
if "status" in self.data:
|
||||||
|
if self.data["status"] == "COMPLETED":
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
60
apps/pyMetadata/sources/source.py
Normal file
60
apps/pyMetadata/sources/source.py
Normal file
@ -0,0 +1,60 @@
|
|||||||
|
|
||||||
|
import logging, re
|
||||||
|
from abc import ABC, abstractmethod
|
||||||
|
from typing import List, Tuple
|
||||||
|
|
||||||
|
from fuzzywuzzy import fuzz
|
||||||
|
|
||||||
|
from clazz.Metadata import Metadata
|
||||||
|
|
||||||
|
log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
class SourceBase(ABC):
    """Base class for metadata sources (mal/anii/imdb): holds the searched
    titles and provides shared fuzzy-matching helpers."""

    # Titles to search for; set per instance in __init__.
    # (Was a mutable class-level default ``= []`` — removed so instances
    # never share state.)
    titles: List[str]

    def __init__(self, titles: List[str]) -> None:
        self.titles = titles

    @abstractmethod
    def search(self) -> Metadata | None:
        """Query the source and return normalized metadata, or None.
        (Fixed stray trailing comma in the original signature.)"""
        pass

    @abstractmethod
    def queryIds(self, title: str) -> dict[str, str]:
        """Return a mapping of source-specific id -> candidate title."""
        pass

    def isMatchOrPartial(self, source: str | None, title, foundTitle) -> bool:
        """Accept *foundTitle* when it fuzzily matches *title* directly
        (>= 60) or when its first segment — text before the first
        punctuation character — matches strongly (>= 80)."""
        titleParts = re.split(r'[^a-zA-Z0-9\s]', foundTitle)
        clean_foundTitle: str | None = titleParts[0].strip() if titleParts else None
        directMatch = fuzz.ratio(title, foundTitle)
        partialMatch = fuzz.ratio(title, clean_foundTitle) if clean_foundTitle is not None else 0

        if directMatch >= 60:
            return True
        elif partialMatch >= 80:
            log.info(f"{source} -> Partial Match for '{title}' of '{foundTitle}' on part '{clean_foundTitle}' with direct score: {directMatch} and partial {partialMatch}")
            return True
        else:
            log.info(f"{source} -> Match failed for '{title}' of '{foundTitle}' on part '{clean_foundTitle}' with direct score: {directMatch} and partial {partialMatch}")
            return False

    def findBestMatchAcrossTitles(self, idToTitle: dict[str, str], titles: List[str]) -> Tuple[str, str]:
        """Return (id, title) of the stored title with the highest fuzzy
        ratio against any searched title; ("", "") when nothing scores."""
        best_match_id = ""
        best_match_title = ""
        best_ratio = 0

        for title in titles:
            for id, stored_title in idToTitle.items():
                ratio = fuzz.ratio(title, stored_title)
                if ratio > best_ratio:
                    best_ratio = ratio
                    best_match_id = id
                    best_match_title = stored_title

        return best_match_id, best_match_title

    def logNoMatch(self, source: str, titles: List[str]) -> None:
        """Log that *source* produced no acceptable match for *titles*."""
        combined_titles = ", ".join(titles)
        log.info(f"No match in source {source} for titles: {combined_titles}")
|
||||||
Loading…
Reference in New Issue
Block a user