From 3b0091a2600689f2edca5ea10fd662ac9fec1922 Mon Sep 17 00:00:00 2001
From: Fries <fries1234@protonmail.com>
Date: Sun, 2 Mar 2025 16:14:12 -0800
Subject: [PATCH] [ie/Tvw] Apply suggestions from PR

---
 yt_dlp/extractor/tvw.py | 93 ++++++++++++++++++++++++-----------------
 1 file changed, 55 insertions(+), 38 deletions(-)
diff --git a/yt_dlp/extractor/tvw.py b/yt_dlp/extractor/tvw.py
index dd5f6eeb6..85a75a81c 100644
--- a/yt_dlp/extractor/tvw.py
+++ b/yt_dlp/extractor/tvw.py
@@ -1,11 +1,12 @@
 import json
 
 from .common import InfoExtractor
-from ..utils import ExtractorError, clean_html, traverse_obj, unified_timestamp, url_or_none
+from ..utils import clean_html, remove_end, traverse_obj, unified_timestamp, url_or_none
 
 
 class TvwIE(InfoExtractor):
     _VALID_URL = r'https?://(?:www\.)?tvw\.org/video/(?P<id>[^/?#]+)'
+    _VALID_THUMBNAIL_URLS = r're:^https?://.*\.(?:jpe?g|png)$'  # 'thumbnail' expectation reused by every _TESTS entry
     _TESTS = [{
         'url': 'https://tvw.org/video/billy-frank-jr-statue-maquette-unveiling-ceremony-2024011211/',
         'md5': '9ceb94fe2bb7fd726f74f16356825703',
@@ -13,11 +14,13 @@ class TvwIE(InfoExtractor):
             'id': '2024011211',
             'ext': 'mp4',
             'title': 'Billy Frank Jr. Statue Maquette Unveiling Ceremony',
-            'thumbnail': r're:^https?://.*\.jpg$',
+            'thumbnail': _VALID_THUMBNAIL_URLS,
             'description': 'md5:58a8150017d985b4f377e11ee8f6f36e',
             'timestamp': 1704902400,
             'upload_date': '20240110',
             'location': 'Legislative Building',
+            'display_id': 'billy-frank-jr-statue-maquette-unveiling-ceremony-2024011211',
+            'categories': ['General Interest'],
         }}, {
         'url': 'https://tvw.org/video/ebeys-landing-state-park-2024081007/',
         'md5': '71e87dae3deafd65d75ff3137b9a32fc',
@@ -25,11 +28,13 @@ class TvwIE(InfoExtractor):
             'id': '2024081007',
             'ext': 'mp4',
             'title': 'Ebey\'s Landing State Park',
-            'thumbnail': r're:^https?://.*\.jpg$',
+            'thumbnail': _VALID_THUMBNAIL_URLS,
             'description': 'md5:50c5bd73bde32fa6286a008dbc853386',
             'timestamp': 1724310900,
             'upload_date': '20240822',
             'location': 'Ebey’s Landing State Park',
+            'display_id': 'ebeys-landing-state-park-2024081007',
+            'categories': ['Washington State Parks'],
         }}, {
         'url': 'https://tvw.org/video/home-warranties-workgroup-2',
         'md5': 'f678789bf94d07da89809f213cf37150',
@@ -37,57 +42,69 @@ class TvwIE(InfoExtractor):
             'id': '1999121000',
             'ext': 'mp4',
             'title': 'Home Warranties Workgroup',
-            'thumbnail': r're:^https?://.*\.jpg$',
+            'thumbnail': _VALID_THUMBNAIL_URLS,
             'description': 'md5:861396cc523c9641d0dce690bc5c35f3',
             'timestamp': 946389600,
             'upload_date': '19991228',
+            'display_id': 'home-warranties-workgroup-2',
+            'categories': ['Legislative'],
+        }}, {
+        'url': 'https://tvw.org/video/washington-to-washington-a-new-space-race-2022041111/?eventID=2022041111',
+        'md5': '6f5551090b351aba10c0d08a881b4f30',
+        'info_dict': {
+            'id': '2022041111',
+            'ext': 'mp4',
+            'title': 'Washington to Washington - A New Space Race',
+            'thumbnail': _VALID_THUMBNAIL_URLS,
+            'description': 'md5:f65a24eec56107afbcebb3aa5cd26341',
+            'timestamp': 1650394800,
+            'upload_date': '20220419',
+            'location': 'Hayner Media Center',
+            'display_id': 'washington-to-washington-a-new-space-race-2022041111',
+            'categories': ['Washington to Washington', 'General Interest'],
         }}]
 
-    def _extract_formats(self, response, video_id):
-        extract_formats = lambda url, video_id: self._extract_m3u8_formats_and_subtitles(url, video_id, 'mp4')
-        stream_urls = traverse_obj(response, 'streamingURIs', {
-            'main': ('main', {url_or_none}),
-            'backup': ('backup', {url_or_none}),
-        })
-
-        try:
-            return extract_formats(stream_urls.get('main'), video_id)
-        except ExtractorError:
-            self.report_warning('Failed to parse the m3u8 stream. Falling back to the backup stream if it exists.')
-            try:
-                return extract_formats(stream_urls.get('backup'), video_id)
-            except ExtractorError:
-                raise
-
     def _real_extract(self, url):
-        video_id = self._match_id(url)
-        webpage = self._download_webpage(url, video_id)
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
 
         client_id = self._html_search_meta('clientID', webpage, fatal=True)
         video_id = self._html_search_meta('eventID', webpage, fatal=True)
 
-        video_data = self._download_json('https://api.v3.invintus.com/v2/Event/getDetailed', video_id,
-                                         headers={
-                                             'authorization': 'embedder',
-                                             'wsc-api-key': '7WhiEBzijpritypp8bqcU7pfU9uicDR',
-                                         },
-                                         data=json.dumps({
-                                             'clientID': client_id,
-                                             'eventID': video_id,
-                                             'showStreams': True,
-                                         }).encode()).get('data')
+        video_data = self._download_json(
+            'https://api.v3.invintus.com/v2/Event/getDetailed', video_id,
+            headers={
+                'authorization': 'embedder',
+                'wsc-api-key': '7WhiEBzijpritypp8bqcU7pfU9uicDR',
+            },
+            data=json.dumps({
+                'clientID': client_id,
+                'eventID': video_id,
+                'showStreams': True,
+            }).encode())['data']
 
-        formats, subtitles = self._extract_formats(video_data, video_id)
+        formats = []
+        subtitles = {}
+
+        for stream_url in traverse_obj(video_data, ('streamingURIs', ..., {url_or_none})):
+            fmts, subs = self._extract_m3u8_formats_and_subtitles(
+                stream_url, video_id, 'mp4', m3u8_id='hls', fatal=False)  # a failing stream is skipped
+            formats.extend(fmts)
+            self._merge_subtitles(subs, target=subtitles)
+        if caption_url := traverse_obj(video_data, ('captionPath', {url_or_none})):  # separate caption file
+            self._merge_subtitles({'en': [{'ext': 'vtt', 'url': caption_url}]}, target=subtitles)
 
         return {
             'id': video_id,
+            'display_id': display_id,
             'formats': formats,
+            'subtitles': subtitles,
+            'title': remove_end(self._og_search_title(webpage, default=None), ' - TVW'),  # webpage fallback
+            'description': self._og_search_description(webpage, default=None),  # webpage fallback
             **traverse_obj(video_data, {
-                'title': ('title', {lambda x: x or self._og_search_title(webpage)}),
-                'description': ('description', {lambda x: clean_html(x) or self._og_search_description(webpage)}),
-                'subtitles': ('captionPath', {
-                    lambda x: self._merge_subtitles({'en': [{'ext': 'vtt', 'url': x}]}, target=subtitles),
-                }),
+                'title': ('title', {str}),
+                'description': ('description', {clean_html}),
+                'categories': ('categories', ..., {str}),
                 'thumbnail': ('videoThumbnail', {url_or_none}),
                 'timestamp': ('startDateTime', {unified_timestamp}),
                 'location': ('locationName', {str}),