From 524e2e4fda4d0deb135398ef85752be522b507e7 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Sun, 26 Sep 2021 01:39:44 +0530 Subject: [PATCH] [outtmpl] Format type `U` for unicode normalization --- README.md | 1 + test/test_YoutubeDL.py | 6 +++++- yt_dlp/YoutubeDL.py | 26 ++++++++++++++++---------- yt_dlp/utils.py | 10 +++++----- 4 files changed, 27 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index a14880282..d13eb4dc1 100644 --- a/README.md +++ b/README.md @@ -964,6 +964,7 @@ The field names themselves (the part inside the parenthesis) can also have some 1. **Alternatives**: Alternate fields can be specified seperated with a `,`. Eg: `%(release_date>%Y,upload_date>%Y|Unknown)s` 1. **Default**: A literal default value can be specified for when the field is empty using a `|` seperator. This overrides `--output-na-template`. Eg: `%(uploader|Unknown)s` 1. **More Conversions**: In addition to the normal format types `diouxXeEfFgGcrs`, `B`, `j`, `l`, `q` can be used for converting to **B**ytes, **j**son, a comma seperated **l**ist and a string **q**uoted for the terminal respectively +1. **Unicode normalization**: The format type `U` can be used for NFC [unicode normalization](https://docs.python.org/3/library/unicodedata.html#unicodedata.normalize). The alternate form flag (`#`) changes the normalization to NFD and the conversion flag `+` can be used for NFKC/NFKD compatibility equivalence normalization. 
Eg: `%(title)+.100U` is NFKC To summarize, the general syntax for a field is: ``` diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index 6feca2ce2..f6483575f 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -649,7 +649,7 @@ class TestYoutubeDL(unittest.TestCase): 'title2': '%PATH%', 'title3': 'foo/bar\\test', 'title4': 'foo "bar" test', - 'title5': 'áéí', + 'title5': 'áéí 𝐀', 'timestamp': 1618488000, 'duration': 100000, 'playlist_index': 1, @@ -769,6 +769,10 @@ class TestYoutubeDL(unittest.TestCase): test('%(formats.:.id) 15l', ' id1, id2, id3') test('%(formats)j', (json.dumps(FORMATS), sanitize(json.dumps(FORMATS)))) test('%(title5).3B', 'á') + test('%(title5)U', 'áéí 𝐀') + test('%(title5)#U', 'a\u0301e\u0301i\u0301 𝐀') + test('%(title5)+U', 'áéí A') + test('%(title5)+#U', 'a\u0301e\u0301i\u0301 A') if compat_os_name == 'nt': test('%(title4)q', ('"foo \\"bar\\" test"', "'foo _'bar_' test'")) else: diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 11371fa86..a6eddd7f7 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -25,6 +25,7 @@ import time import tokenize import traceback import random +import unicodedata from string import ascii_letters @@ -908,7 +909,7 @@ class YoutubeDL(object): def validate_outtmpl(cls, outtmpl): ''' @return None or Exception object ''' outtmpl = re.sub( - STR_FORMAT_RE_TMPL.format('[^)]*', '[ljqB]'), + STR_FORMAT_RE_TMPL.format('[^)]*', '[ljqBU]'), lambda mobj: f'{mobj.group(0)[:-1]}s', cls._outtmpl_expandpath(outtmpl)) try: @@ -940,7 +941,7 @@ class YoutubeDL(object): } TMPL_DICT = {} - EXTERNAL_FORMAT_RE = re.compile(STR_FORMAT_RE_TMPL.format('[^)]*', f'[{STR_FORMAT_TYPES}ljqB]')) + EXTERNAL_FORMAT_RE = re.compile(STR_FORMAT_RE_TMPL.format('[^)]*', f'[{STR_FORMAT_TYPES}ljqBU]')) MATH_FUNCTIONS = { '+': float.__add__, '-': float.__sub__, @@ -1031,21 +1032,26 @@ class YoutubeDL(object): value = default if value is None else value str_fmt = f'{fmt[:-1]}s' - if fmt[-1] == 'l': + if 
fmt[-1] == 'l': # list value, fmt = ', '.join(variadic(value)), str_fmt - elif fmt[-1] == 'j': + elif fmt[-1] == 'j': # json value, fmt = json.dumps(value, default=_dumpjson_default), str_fmt - elif fmt[-1] == 'q': + elif fmt[-1] == 'q': # quoted value, fmt = compat_shlex_quote(str(value)), str_fmt - elif fmt[-1] == 'B': + elif fmt[-1] == 'B': # bytes value = f'%{str_fmt}'.encode('utf-8') % str(value).encode('utf-8') value, fmt = value.decode('utf-8', 'ignore'), 's' + elif fmt[-1] == 'U': # unicode normalized + opts = outer_mobj.group('conversion') or '' + value, fmt = unicodedata.normalize( + # "+" = compatibility equivalence, "#" = NFD + 'NF%s%s' % ('K' if '+' in opts else '', 'D' if '#' in opts else 'C'), + value), str_fmt elif fmt[-1] == 'c': - value = str(value) - if value is None: - value, fmt = default, 's' + if value: + value = str(value)[0] else: - value = value[0] + fmt = str_fmt elif fmt[-1] not in 'rs': # numeric value = float_or_none(value) if value is None: diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py index 141d2c9cc..770d7feb9 100644 --- a/yt_dlp/utils.py +++ b/yt_dlp/utils.py @@ -4474,12 +4474,12 @@ OUTTMPL_TYPES = { STR_FORMAT_RE_TMPL = r'''(?x) (?<!%)(?P<prefix>(?:%%)*) % - (?P<has_key>\((?P<key>{0})\))? # mapping key + (?P<has_key>\((?P<key>{0})\))? (?P<format> - (?:[#0\-+ ]+)? # conversion flags (optional) - (?:\d+)? # minimum field width (optional) - (?:\.\d+)? # precision (optional) - [hlL]? # length modifier (optional) + (?P<conversion>[#0\-+ ]+)? + (?P<min_width>\d+)? + (?P<precision>\.\d+)? + (?P<len_mod>[hlL])? # unused in python {1} # conversion type ) '''