Add regex to `--match-filter`

author pukkandan <redacted>

Wed, 4 Aug 2021 21:31:23 +0000 (03:01 +0530)

committer pukkandan <redacted>

Wed, 4 Aug 2021 22:40:26 +0000 (04:10 +0530)
author pukkandan <redacted>
Wed, 4 Aug 2021 21:31:23 +0000 (03:01 +0530)
committer pukkandan <redacted>
Wed, 4 Aug 2021 22:40:26 +0000 (04:10 +0530)
diff --git a/README.md b/README.md

index 1967d216cf4337e5690f4ab8b368f656ba5907c9..a308aa196f748a332977aedd0483025a1884d887 100644 (file)
--- a/README.md
+++ b/README.md
@@ -340,19 +340,22 @@ ## Video Selection:
                                       COUNT views
      --match-filter FILTER            Generic video filter. Any field (see
                                       "OUTPUT TEMPLATE") can be compared with a
-                                     number or a quoted string using the
-                                     operators defined in "Filtering formats".
-                                     You can also simply specify a field to
-                                     match if the field is present and "!field"
-                                     to check if the field is not present.
-                                     Multiple filters can be checked using "&".
-                                     For example, to only match videos that are
-                                     not live, has a like count more than 100, a
-                                     dislike count less than 50 (or the dislike
+                                     number or a string using the operators
+                                     defined in "Filtering formats". You can
+                                     also simply specify a field to match if the
+                                     field is present and "!field" to check if
+                                     the field is not present. In addition,
+                                     Python style regular expression matching
+                                     can be done using "~=", and multiple
+                                     filters can be checked with "&". Use a "\"
+                                     to escape "&" or quotes if needed. Eg:
+                                     --match-filter "!is_live & like_count>?100
+                                     & description~='(?i)\bcats \& dogs\b'"
+                                     matches only videos that are not live, have
+                                     a like count more than 100 (or the like
                                       field is not available), and also has a
-                                     description that contains "python", use
-                                     --match-filter "!is_live & like_count>100 &
-                                     dislike_count<?50 & description*='python'"
+                                     description that contains the phrase "cats
+                                     & dogs" (ignoring case)
      --no-match-filter                Do not use generic video filter (default)
      --no-playlist                    Download only the video, if the URL refers
                                       to a video and a playlist
diff --git a/test/test_utils.py b/test/test_utils.py

index 5ac5dedc993193e0074fff82d9da4dc24dfd57bf..aef59e49197ff090744bec2d0618877ccb808942 100644 (file)
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -1207,11 +1207,26 @@ def test_render_table(self):
              '9999 51')
  
      def test_match_str(self):
+        # Unary
          self.assertFalse(match_str('xy', {'x': 1200}))
          self.assertTrue(match_str('!xy', {'x': 1200}))
          self.assertTrue(match_str('x', {'x': 1200}))
          self.assertFalse(match_str('!x', {'x': 1200}))
          self.assertTrue(match_str('x', {'x': 0}))
+        self.assertTrue(match_str('is_live', {'is_live': True}))
+        self.assertFalse(match_str('is_live', {'is_live': False}))
+        self.assertFalse(match_str('is_live', {'is_live': None}))
+        self.assertFalse(match_str('is_live', {}))
+        self.assertFalse(match_str('!is_live', {'is_live': True}))
+        self.assertTrue(match_str('!is_live', {'is_live': False}))
+        self.assertTrue(match_str('!is_live', {'is_live': None}))
+        self.assertTrue(match_str('!is_live', {}))
+        self.assertTrue(match_str('title', {'title': 'abc'}))
+        self.assertTrue(match_str('title', {'title': ''}))
+        self.assertFalse(match_str('!title', {'title': 'abc'}))
+        self.assertFalse(match_str('!title', {'title': ''}))
+
+        # Numeric
          self.assertFalse(match_str('x>0', {'x': 0}))
          self.assertFalse(match_str('x>0', {}))
          self.assertTrue(match_str('x>?0', {}))
@@ -1219,6 +1234,8 @@ def test_match_str(self):
          self.assertFalse(match_str('x>2K', {'x': 1200}))
          self.assertTrue(match_str('x>=1200 & x < 1300', {'x': 1200}))
          self.assertFalse(match_str('x>=1100 & x < 1200', {'x': 1200}))
+
+        # String
          self.assertFalse(match_str('y=a212', {'y': 'foobar42'}))
          self.assertTrue(match_str('y=foobar42', {'y': 'foobar42'}))
          self.assertFalse(match_str('y!=foobar42', {'y': 'foobar42'}))
@@ -1234,6 +1251,8 @@ def test_match_str(self):
          self.assertTrue(match_str('y!*=baz', {'y': 'foobar42'}))
          self.assertTrue(match_str('y$=42', {'y': 'foobar42'}))
          self.assertFalse(match_str('y$=43', {'y': 'foobar42'}))
+
+        # And
          self.assertFalse(match_str(
              'like_count > 100 & dislike_count <? 50 & description',
              {'like_count': 90, 'description': 'foo'}))
@@ -1246,18 +1265,29 @@ def test_match_str(self):
          self.assertFalse(match_str(
              'like_count > 100 & dislike_count <? 50 & description',
              {'like_count': 190, 'dislike_count': 10}))
-        self.assertTrue(match_str('is_live', {'is_live': True}))
-        self.assertFalse(match_str('is_live', {'is_live': False}))
-        self.assertFalse(match_str('is_live', {'is_live': None}))
-        self.assertFalse(match_str('is_live', {}))
-        self.assertFalse(match_str('!is_live', {'is_live': True}))
-        self.assertTrue(match_str('!is_live', {'is_live': False}))
-        self.assertTrue(match_str('!is_live', {'is_live': None}))
-        self.assertTrue(match_str('!is_live', {}))
-        self.assertTrue(match_str('title', {'title': 'abc'}))
-        self.assertTrue(match_str('title', {'title': ''}))
-        self.assertFalse(match_str('!title', {'title': 'abc'}))
-        self.assertFalse(match_str('!title', {'title': ''}))
+
+        # Regex
+        self.assertTrue(match_str(r'x~=\bbar', {'x': 'foo bar'}))
+        self.assertFalse(match_str(r'x~=\bbar.+', {'x': 'foo bar'}))
+        self.assertFalse(match_str(r'x~=^FOO', {'x': 'foo bar'}))
+        self.assertTrue(match_str(r'x~=(?i)^FOO', {'x': 'foo bar'}))
+
+        # Quotes
+        self.assertTrue(match_str(r'x^="foo"', {'x': 'foo "bar"'}))
+        self.assertFalse(match_str(r'x^="foo  "', {'x': 'foo "bar"'}))
+        self.assertFalse(match_str(r'x$="bar"', {'x': 'foo "bar"'}))
+        self.assertTrue(match_str(r'x$=" \"bar\""', {'x': 'foo "bar"'}))
+
+        # Escaping &
+        self.assertFalse(match_str(r'x=foo & bar', {'x': 'foo & bar'}))
+        self.assertTrue(match_str(r'x=foo \& bar', {'x': 'foo & bar'}))
+        self.assertTrue(match_str(r'x=foo \& bar & x^=foo', {'x': 'foo & bar'}))
+        self.assertTrue(match_str(r'x="foo \& bar" & x^=foo', {'x': 'foo & bar'}))
+
+        # Example from docs
+        self.assertTrue(
+            r'!is_live & like_count>?100 & description~=\'(?i)\bcats \& dogs\b\'',
+            {'description': 'Raining Cats & Dogs'})
  
      def test_parse_dfxp_time_expr(self):
          self.assertEqual(parse_dfxp_time_expr(None), None)
diff --git a/yt_dlp/options.py b/yt_dlp/options.py

index fba23138292253a80e0ab27221085204325366e6..b5ddbeaff7db5f253001661ce32aa06ef5f2c2e2 100644 (file)
--- a/yt_dlp/options.py
+++ b/yt_dlp/options.py
@@ -378,13 +378,14 @@ def _dict_from_options_callback(
              'Generic video filter. Any field (see "OUTPUT TEMPLATE") can be compared with a '
              'number or a string using the operators defined in "Filtering formats". '
              'You can also simply specify a field to match if the field is present '
-            'and "!field" to check if the field is not present. '
-            'Multiple filters can be checked using "&". '
-            'For example, to only match videos that are not live, '
-            'has a like count more than 100, a dislike count less than 50 '
-            '(or the dislike field is not available), and also has a description '
-            'that contains "python", use --match-filter "!is_live & '
-            'like_count>100 & dislike_count<?50 & description*=\'python\'"'))
+            'and "!field" to check if the field is not present. In addition, '
+            'Python style regular expression matching can be done using "~=", '
+            'and multiple filters can be checked with "&". '
+            'Use a "\\" to escape "&" or quotes if needed. Eg: --match-filter '
+            r'"!is_live & like_count>?100 & description~=\'(?i)\bcats \& dogs\b\'" '
+            'matches only videos that are not live, has a like count more than 100 '
+            '(or the like field is not available), and also has a description '
+            'that contains the phrase "cats & dogs" (ignoring case)'))
      selection.add_option(
          '--no-match-filter',
          metavar='FILTER', dest='match_filter', action='store_const', const=None,
diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py

index d06b18e005d45336d5ba5d8e2e609240d3993d35..b04fbd22cf526e4ea672b04767ae59c8cf657589 100644 (file)
--- a/yt_dlp/utils.py
+++ b/yt_dlp/utils.py
@@ -4664,23 +4664,28 @@ def filter_using_list(row, filterArray):
  
  def _match_one(filter_part, dct):
      # TODO: Generalize code with YoutubeDL._build_format_filter
+    STRING_OPERATORS = {
+        '*=': operator.contains,
+        '^=': lambda attr, value: attr.startswith(value),
+        '$=': lambda attr, value: attr.endswith(value),
+        '~=': lambda attr, value: re.search(value, attr),
+    }
      COMPARISON_OPERATORS = {
+        **STRING_OPERATORS,
+        '<=': operator.le,  # "<=" must be defined above "<"
          '<': operator.lt,
-        '<=': operator.le,
-        '>': operator.gt,
          '>=': operator.ge,
+        '>': operator.gt,
          '=': operator.eq,
-        '*=': operator.contains,
-        '^=': lambda attr, value: attr.startswith(value),
-        '$=': lambda attr, value: attr.endswith(value),
      }
+
      operator_rex = re.compile(r'''(?x)\s*
          (?P<key>[a-z_]+)
          \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
          (?:
              (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
-            (?P<quote>["\'])(?P<quotedstrval>(?:\\.|(?!(?P=quote)|\\).)+?)(?P=quote)|
-            (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
+            (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
+            (?P<strval>.+?)
          )
          \s*$
          ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
@@ -4705,9 +4710,8 @@ def _match_one(filter_part, dct):
              if quote is not None:
                  comparison_value = comparison_value.replace(r'\%s' % quote, quote)
          else:
-            if m.group('op') in ('*=', '^=', '$='):
-                raise ValueError(
-                    'Operator %s only supports string values!' % m.group('op'))
+            if m.group('op') in STRING_OPERATORS:
+                raise ValueError('Operator %s only supports string values!' % m.group('op'))
              try:
                  comparison_value = int(m.group('intval'))
              except ValueError:
@@ -4743,7 +4747,8 @@ def match_str(filter_str, dct):
      """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
  
      return all(
-        _match_one(filter_part, dct) for filter_part in filter_str.split('&'))
+        _match_one(filter_part.replace(r'\&', '&'), dct)
+        for filter_part in re.split(r'(?<!\\)&', filter_str))
  
  
  def match_filter_func(filter_str):
author	pukkandan <redacted>
	Wed, 4 Aug 2021 21:31:23 +0000 (03:01 +0530)
committer	pukkandan <redacted>
	Wed, 4 Aug 2021 22:40:26 +0000 (04:10 +0530)
README.md		patch | blob | blame | history
test/test_utils.py		patch | blob | blame | history
yt_dlp/options.py		patch | blob | blame | history
yt_dlp/utils.py		patch | blob | blame | history