Regex parsing of media file name

This commit is contained in:
Brage Skjønborg 2025-12-08 18:52:14 +01:00
parent e84a6494aa
commit b32ff8ce4f
2 changed files with 263 additions and 102 deletions

View File

@ -40,10 +40,12 @@ class MediaEventParsedInfoListener : EventListener() {
/**
 * Cuts the text at a resolution marker: one or more digits followed by `p` or `k`
 * (case-insensitive). The marker and the rest of that line are removed.
 */
fun String.noResolutionAndAfter(): String {
    val resolutionToEnd = Regex("[0-9]+[pk].*", RegexOption.IGNORE_CASE)
    return replace(resolutionToEnd, "")
}
/**
 * Replaces a source/release tag (bluray, laserdisc, dvd, web, uhd, hd, htds, imax;
 * case-insensitive) and the rest of that line with a single space.
 *
 * A tag is only recognised when it is not directly preceded by a letter, so ordinary
 * words that merely contain a tag ("Climax" contains "imax", "Cobweb" contains "web")
 * are no longer truncated. Tags that start a longer token (HDTV, WEBRip, ...) still match.
 */
fun String.noSourceTags(): String {
    // The look-behind uses \p{L} so letters outside ASCII also count as "inside a word".
    val sourceTagToEnd = Regex(
        "(?<!\\p{L})(bluray|laserdisc|dvd|web|uhd|hd|htds|imax).*",
        RegexOption.IGNORE_CASE
    )
    return replace(sourceTagToEnd, " ")
}
/** Turns every underscore in the text into a space. */
fun String.noUnderscores(): String = replace('_', ' ')
/**
 * Removes every standalone four-digit number (word-bounded) from the text.
 *
 * A string that consists of nothing but four digits is returned unchanged, so a title
 * such as "2012" is not reduced to an empty string. The previous guard
 * (`takeIf { ... } ?: this`) always evaluated to the receiver and therefore had no effect.
 */
fun String.noYear(): String {
    // The whole text is just a four-digit number: treat it as the title, not as a year tag.
    if (matches(Regex("\\d{4}"))) return this
    return replace(Regex("\\b\\d{4}\\b"), "")
}
/**
 * Replaces dots with spaces, except a dot that directly follows one of the listed
 * abbreviations (Dr, Mr, Ms, Mrs, Lt, Capt, Prof, St, Ave) at a word boundary.
 */
fun String.noDots(): String {
    val dotOutsideAbbreviation = Regex("(?<!\\b(?:Dr|Mr|Ms|Mrs|Lt|Capt|Prof|St|Ave))\\.")
    return replace(dotOutsideAbbreviation, " ")
}
/** Collapses each run of two or more whitespace characters into a single space. */
fun String.noExtraSpaces(): String {
    val whitespaceRun = Regex("\\s{2,}")
    return replace(whitespaceRun, " ")
}
/** Strips leading and trailing dots, commas, spaces, underscores and hyphens. */
fun String.fullTrim(): String =
    trim { ch -> ch == '.' || ch == ',' || ch == ' ' || ch == '_' || ch == '-' }
enum class MediaType {
Movie,
@ -55,14 +57,15 @@ class MediaEventParsedInfoListener : EventListener() {
// Series patterns: cover all common shorthand forms and variants
val seriesPatterns = listOf(
Regex("s\\d{1,2}e\\d{1,2}"), // S01E03, s1e5
Regex("\\d{1,2}x\\d{1,2}"), // 1x03, 2x10
Regex("season\\s*\\d+"), // Season 2
Regex("episode\\s*\\d+"), // Episode 5
Regex("ep\\s*\\d+"), // Ep05, Ep 5
Regex("s\\d{1,2}\\s*[- ]\\s*e\\d{1,2}"), // S1 - E5, S01 - E05
Regex("s\\d{1,2}\\s*ep\\s*\\d{1,2}"), // S1 Ep05
Regex("series\\s*\\d+"), // Series 2 (britisk stil)
Regex("s\\d{1,2}e\\d{1,2}"), // S01E03, s1e5
Regex("\\d{1,2}x\\d{1,2}"), // 1x03, 2x10
Regex("season\\s*\\d+"), // Season 2
Regex("episode\\s*\\d+"), // Episode 5
Regex("ep\\s*\\d+"), // Ep05, Ep 5
Regex("s\\d{1,2}\\s*[- ]\\s*e\\d{1,2}"), // S1 - E5, S01 - E05
Regex("s\\d{1,2}\\s*ep\\s*\\d{1,2}"), // S1 Ep05
Regex("series\\s*\\d+"), // Series 2 (britisk stil)
Regex("s\\d{1,2}[. ]e\\d{1,2}") // S01.E02 eller S01 E02
)
if (seriesPatterns.any { it.containsMatchIn(name) }) {
@ -89,7 +92,7 @@ class MediaEventParsedInfoListener : EventListener() {
}
fun File.getDesiredCollection(): String {
val collection = when (this.guessMovieOrSeries()) {
val collection = when (this.guessMovieOrSeries()) {
MediaType.Movie -> this.guessDesiredMovieTitle()
MediaType.Serie -> this.guessDesiredSerieTitle()
}
@ -122,22 +125,43 @@ class MediaEventParsedInfoListener : EventListener() {
val seasonRegex = Regex("""(?i)(?:S|Season|Series)\s*(\d{1,2})""")
val episodeRegex = Regex("""(?i)(?:E|Episode|Ep)\s*(\d{1,3})""")
val revisionRegex = Regex("""(?i)\bv(\d+)\b""")
val seasonEpisodeRegex = Regex("""(?i)(\d{1,2})x(\d{1,2})(?:[vV](\d+))?""")
val seasonMatch = seasonRegex.find(raw)
val episodeMatch = episodeRegex.find(raw)
val revisionMatch = revisionRegex.find(raw)
var season: Int? = null
var episode: Int? = null
var revision: Int? = null
var baseTitle = raw.getCleanedTitle()
var episodeTitle = ""
val season = seasonMatch?.groupValues?.get(1)?.toIntOrNull()
val episode = episodeMatch?.groupValues?.get(1)?.toIntOrNull()
val revision = revisionMatch?.groupValues?.get(1)?.toIntOrNull()
val seMatch = seasonEpisodeRegex.find(raw)
if (seMatch != null) {
season = seMatch.groupValues[1].toIntOrNull()
episode = seMatch.groupValues[2].toIntOrNull()
revision = seMatch.groupValues.getOrNull(3)?.toIntOrNull()
baseTitle = raw.substring(0, seMatch.range.first).getCleanedTitle()
episodeTitle = raw.substring(seMatch.range.last + 1).getCleanedTitle()
} else {
val seasonMatch = seasonRegex.find(raw)
val episodeMatch = episodeRegex.find(raw)
val revisionMatch = revisionRegex.find(raw)
val baseTitle = if (seasonMatch != null) {
raw.substring(0, seasonMatch.range.first).getCleanedTitle()
} else raw.getCleanedTitle()
season = seasonMatch?.groupValues?.get(1)?.toIntOrNull()
episode = episodeMatch?.groupValues?.get(1)?.toIntOrNull()
revision = revisionMatch?.groupValues?.get(1)?.toIntOrNull()
val episodeTitle = if (episodeMatch != null) {
raw.substring(episodeMatch.range.last + 1).getCleanedTitle()
} else ""
baseTitle = if (seasonMatch != null) {
raw.substring(0, seasonMatch.range.first).getCleanedTitle()
} else raw.getCleanedTitle()
episodeTitle = if (episodeMatch != null) {
raw.substring(episodeMatch.range.last + 1).getCleanedTitle()
} else ""
}
// Fallback: if baseTitle is blank or contains only season/episode markers, use the parent folder name
if (baseTitle.isBlank() || baseTitle.matches(Regex("""(?i)^s?\d+e?\d+$"""))) {
baseTitle = this.parentFile?.name?.getCleanedTitle() ?: "Dumb ways to die"
}
val tag = buildString {
append("S${(season ?: 1).toString().padStart(2, '0')}")
@ -159,29 +183,35 @@ class MediaEventParsedInfoListener : EventListener() {
fun File.guessSearchableTitle(): List<String> {
val cleaned = this.guessDesiredFileName().noParens()
.let {
val regex = "\\((?!\\d{4}\\))(?>[^()]+|\\b)\\)"
Regex(regex).replace(it, "")
}
val cleaned = this.guessDesiredFileName()
.noResolutionAndAfter()
.noSourceTags()
.noDots()
.noExtraSpaces()
.trim('.', ',', ' ')
.fullTrim()
val titles = mutableListOf<String>()
// 1. Første del før bindestrek
val yearRegex = Regex("""\b(19|20)\d{2}\b""")
val hasYear = yearRegex.containsMatchIn(cleaned)
// 1. Hvis årstall finnes, legg hele cleaned først
if (hasYear) {
titles.add(cleaned)
}
// 2. Første del før bindestrek
val firstPart = cleaned.split(" - ").firstOrNull()?.trim() ?: cleaned
titles.add(firstPart)
// 2. Hele cleaned
titles.add(cleaned)
// 3. Hele cleaned (hvis ikke allerede lagt inn først)
if (!hasYear) {
titles.add(cleaned)
}
// 3. Fjern årstall hvis det finnes
val yearRegex = Regex("""\b(19|20)\d{2}\b""")
val noYear = yearRegex.replace(cleaned, "").trim()
// 4. Variant uten årstall
val noYear = yearRegex.replace(cleaned, "")
.noParens().trim()
if (noYear.isNotEmpty() && noYear != cleaned) {
titles.add(noYear)
}
@ -190,5 +220,4 @@ class MediaEventParsedInfoListener : EventListener() {
}
}

View File

@ -11,14 +11,6 @@ import java.io.File
class MediaEventParsedInfoListenerTest : MediaEventParsedInfoListener() {
/**
 * Parameterized test fed by the `fileNameSanitizeTest` method source.
 *
 * Builds a FileNameParser from the case's input string, calls guessDesiredFileName()
 * on it and asserts that the result equals the case's expected value.
 */
@MethodSource("fileNameSanitizeTest")
@ParameterizedTest(name = "{0}")
fun fileNameSanitizeTest(testCase: SanitizeTestCase) {
val parser = FileNameParser(testCase.input)
val result = parser.guessDesiredFileName()
assertThat(result).isEqualTo(testCase.expected)
}
@MethodSource("parsedInfoTest")
@ParameterizedTest(name = "{0}")
fun parsedInfoTest(testCase: ParsedInfoTestCase) {
@ -39,12 +31,6 @@ class MediaEventParsedInfoListenerTest : MediaEventParsedInfoListener() {
assertThat(mediaType).isEqualTo(testCase.expectedType)
}
/**
 * One input/expected pair for the file-name sanitize test.
 *
 * @property input raw name handed to the parser
 * @property expected value the test compares the parser's result against
 */
data class SanitizeTestCase(
val input: String,
val expected: String
)
data class ParsedInfoTestCase(
val file: File,
val expectedTitle: String,
@ -58,61 +44,10 @@ class MediaEventParsedInfoListenerTest : MediaEventParsedInfoListener() {
)
companion object {
/**
 * Supplies the named cases for the `fileNameSanitizeTest` parameterized test.
 *
 * Each entry wraps a SanitizeTestCase (raw input plus expected result) in a
 * Named value whose label describes the scenario.
 */
@JvmStatic
fun fileNameSanitizeTest() = listOf(
// Series release name: everything from the resolution tag onwards is expected to be dropped.
Named.of(
"Basic sanitization",
SanitizeTestCase(
input = "Fancy.Thomas.S03E03.Enemy.1080p.AMAZING.WEB-VALUE.DDP5AN.1.H.264",
expected = "Fancy Thomas S03E03 Enemy"
)
),
// Bracketed groups around the name are expected to be removed; numbers in the name are kept.
Named.of(
"Name with numbers",
SanitizeTestCase(
input = "[TST] Fancy Name Test 99 - 01 [Nans][#00A8E6]",
expected = "Fancy Name Test 99 - 01"
)
),
// Dots become spaces while the escaped ampersand sequence is expected to stay as-is.
Named.of(
"Dot removal and special characters",
SanitizeTestCase(
input = "Like.a.Potato.Chef.S01E01.Departure.\\u0026.Skills.1080p.Potato",
expected = "Like a Potato Chef S01E01 Departure \\u0026 Skills"
)
),
// The sequel number stays; the year and release tags are expected to go.
Named.of(
"Movie name with numbers",
SanitizeTestCase(
input = "Wicket.Wicker.Potato.4.2023.UHD.BluRay.2160p",
expected = "Wicket Wicker Potato 4"
)
),
// A name that is already clean is expected to come back unchanged.
Named.of(
"Movie with extended title",
SanitizeTestCase(
input = "Potato-Pass Movie - Skinke",
expected = "Potato-Pass Movie - Skinke"
)
),
// The parenthesised year and trailing tags are expected to be stripped.
Named.of(
"Title with year in parentheses",
SanitizeTestCase(
input = "Amazing Potato (2022) 1080p BluRay",
expected = "Amazing Potato"
)
),
// Input that starts with the season/episode marker is expected to come back unchanged.
Named.of(
"Same",
SanitizeTestCase(
input = "S01E03-How to unlucky i am",
expected = "S01E03-How to unlucky i am"
)
)
)
@JvmStatic
fun parsedInfoTest() = listOf(
// existing parsed cases
Named.of(
"Series episode parsing",
ParsedInfoTestCase(
@ -148,6 +83,154 @@ class MediaEventParsedInfoListenerTest : MediaEventParsedInfoListener() {
expectedFileName = "Potato-Pass Movie - Skinke",
expectedSearchTitles = listOf("Potato-Pass Movie", "Potato-Pass Movie - Skinke")
)
),
Named.of(
"Name with numbers",
ParsedInfoTestCase(
file = File("[TST] Fancy Name Test 99 - 01 [Nans][#00A8E6].mkv"),
expectedTitle = "Fancy Name Test 99",
expectedFileName = "Fancy Name Test 99 - 01",
expectedSearchTitles = listOf("Fancy Name Test 99", "Fancy Name Test 99 - 01")
)
),
Named.of(
"Movie name with numbers",
ParsedInfoTestCase(
file = File("Wicket.Wicker.Potato.4.2023.UHD.BluRay.2160p.mkv"),
expectedTitle = "Wicket Wicker Potato 4",
expectedFileName = "Wicket Wicker Potato 4",
expectedSearchTitles = listOf("Wicket Wicker Potato 4")
)
),
Named.of(
"Title with year in parentheses",
ParsedInfoTestCase(
file = File("Amazing Potato (2022) 1080p BluRay.mkv"),
expectedTitle = "Amazing Potato",
expectedFileName = "Amazing Potato",
expectedSearchTitles = listOf("Amazing Potato")
)
),
Named.of(
"Same",
ParsedInfoTestCase(
file = File("/Dumb ways to die/S01E03-How to unlucky i am.mkv"),
expectedTitle = "Dumb ways to die",
expectedFileName = "Dumb ways to die - S01E03 - How to unlucky i am",
expectedSearchTitles = listOf(
"Dumb ways to die",
"Dumb ways to die - S01E03 - How to unlucky i am"
)
)
),
Named.of(
"Underscores and mixed tags",
ParsedInfoTestCase(
file = File("my_movie_title_2019_1080p_x264_YTS.mkv"),
expectedTitle = "my movie title",
expectedFileName = "my movie title (2019)",
expectedSearchTitles = listOf("my movie title (2019)", "my movie title")
)
),
Named.of(
"Multiple bracketed groups and release tags",
ParsedInfoTestCase(
file = File("[GROUP][WEBRip][YTS]Some.Movie.Title.720p.WEBRip.x264.AAC-[eztv].mkv"),
expectedTitle = "Some Movie Title",
expectedFileName = "Some Movie Title",
expectedSearchTitles = listOf("Some Movie Title")
)
),
Named.of(
"Remux, PROPER, REPACK and extras",
ParsedInfoTestCase(
file = File("Cool.Movie.2018.1080p.BluRay.REMUX.PROPER.REPACK.READNFO-GRP.mkv"),
expectedTitle = "Cool Movie",
expectedFileName = "Cool Movie",
expectedSearchTitles = listOf("Cool Movie")
)
),
Named.of(
"Hyphens and multiple dashes",
ParsedInfoTestCase(
file = File("Potato-Fields_-_A-Strange.Day-2017-HDTV-720p.mkv"),
expectedTitle = "Potato-Fields",
expectedFileName = "Potato-Fields - A-Strange Day",
expectedSearchTitles = listOf("Potato-Fields", "Potato-Fields - A-Strange Day")
)
),
Named.of(
"Trailing group and site tags",
ParsedInfoTestCase(
file = File("Movie.Name.2015.1080p.BluRay.x264-[YTS.MX].mkv"),
expectedTitle = "Movie Name",
expectedFileName = "Movie Name",
expectedSearchTitles = listOf("Movie Name")
)
),
Named.of(
"IMAX and UNRATED markers",
ParsedInfoTestCase(
file = File("Epic.Film.IMAX.UNRATED.2019.2160p.HDR.HEVC.mkv"),
expectedTitle = "Epic Film",
expectedFileName = "Epic Film",
expectedSearchTitles = listOf("Epic Film")
)
),
Named.of(
"Sample and Trailer should be stripped",
ParsedInfoTestCase(
file = File("Amazing.Movie.2020.1080p.Trailer-SAMPLE.mp4"),
expectedTitle = "Amazing Movie",
expectedFileName = "Amazing Movie",
expectedSearchTitles = listOf("Amazing Movie")
)
),
Named.of(
"Parentheses director's cut",
ParsedInfoTestCase(
file = File("The.Great.Film.(Director's.Cut).2016.1080p.BluRay.mkv"),
expectedTitle = "The Great Film",
expectedFileName = "The Great Film",
expectedSearchTitles = listOf("The Great Film")
)
),
Named.of(
"Mixed separators and version tags",
ParsedInfoTestCase(
file = File("Show.Name.S01.E02.720p.HDTV.x264-Group_v2.mkv"),
expectedTitle = "Show Name",
expectedFileName = "Show Name - S01E02",
expectedSearchTitles = listOf("Show Name", "Show Name - S01E02")
)
),
Named.of(
"Square brackets year and tags",
ParsedInfoTestCase(
file = File("Title [2014] [1080p] [BluRay] [ENG].mkv"),
expectedTitle = "Title",
expectedFileName = "Title",
expectedSearchTitles = listOf("Title")
)
),
Named.of(
"Version suffixes and fix tags",
ParsedInfoTestCase(
file = File("Movie.Title.720p.HDTV.x264-FLEET.fix.mkv"),
expectedTitle = "Movie Title",
expectedFileName = "Movie Title",
expectedSearchTitles = listOf("Movie Title")
)
),
Named.of(
"Nested brackets and group names",
ParsedInfoTestCase(
file = File("[HD] (2020) Weird.Movie.Title - Extended.Edition [Group-Name].mkv"),
expectedTitle = "Weird Movie Title",
expectedFileName = "Weird Movie Title - Extended Edition",
expectedSearchTitles = listOf("Weird Movie Title", "Weird Movie Title - Extended Edition")
)
)
)
@ -188,8 +271,57 @@ class MediaEventParsedInfoListenerTest : MediaEventParsedInfoListener() {
expectedType = MediaType.Movie
)
),
// Additional parse/dumb filename cases
Named.of(
"Lowercase sXe pattern",
ParseVideoTypeTestCase(
file = File("weird_show.s01e02.720p.mkv"),
expectedType = MediaType.Serie
)
),
Named.of(
"Spaces and full words",
ParseVideoTypeTestCase(
file = File("Some Show Season 02 Episode 09 1080p.mkv"),
expectedType = MediaType.Serie
)
),
Named.of(
"1x02 style",
ParseVideoTypeTestCase(
file = File("Show.Name.1x02.HDTV.mp4"),
expectedType = MediaType.Serie
)
),
Named.of(
"Season and episode no separators",
ParseVideoTypeTestCase(
file = File("ShowNameSeason03Episode04.avi"),
expectedType = MediaType.Serie
)
),
Named.of(
"Movie with year and extra tags",
ParseVideoTypeTestCase(
file = File("Some.Movie.Title.1999.720p.BluRay.x264-GROUP.mkv"),
expectedType = MediaType.Movie
)
),
Named.of(
"Confusing underscores and trailers",
ParseVideoTypeTestCase(
file = File("a_movie_trailer_2017_sample.mp4"),
expectedType = MediaType.Movie
)
),
Named.of(
"Mixed separators and version tags",
ParseVideoTypeTestCase(
file = File("Show.Name.S01.E02.720p.HDTV.x264-Group_v2.mkv"),
expectedType = MediaType.Serie
)
),
)
}
}