From d912ee56043c3d30a3c215d7fe02fa46e56ceea4 Mon Sep 17 00:00:00 2001 From: Ben Faerber Date: Tue, 4 Feb 2025 21:28:39 -0700 Subject: [PATCH] [softwhiteunderbelly] Add softwhiteunderbelly.com documentary extractor --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/softwhiteunderbelly.py | 147 ++++++++++++++++++++++++ 2 files changed, 148 insertions(+) create mode 100644 yt_dlp/extractor/softwhiteunderbelly.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index c331bab78..21a501e21 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1897,6 +1897,7 @@ SohuIE, SohuVIE, ) +from .softwhiteunderbelly import SoftWhiteUnderbellyIE from .sonyliv import ( SonyLIVIE, SonyLIVSeriesIE, diff --git a/yt_dlp/extractor/softwhiteunderbelly.py b/yt_dlp/extractor/softwhiteunderbelly.py new file mode 100644 index 000000000..6e0c51bef --- /dev/null +++ b/yt_dlp/extractor/softwhiteunderbelly.py @@ -0,0 +1,147 @@ +import functools + +from .common import InfoExtractor +from .vimeo import VHXEmbedIE +from ..utils import ( + ExtractorError, + OnDemandPagedList, + clean_html, + extract_attributes, + get_element_by_class, + get_element_by_id, + get_elements_html_by_class, + int_or_none, + traverse_obj, + unified_strdate, + urlencode_postdata, +) + + +class SoftWhiteUnderbellyIE(InfoExtractor): + _LOGIN_URL = "https://www.softwhiteunderbelly.com/login" + _NETRC_MACHINE = "softwhiteunderbelly" + + _VALID_URL = r"https?://(?:www\.)?softwhiteunderbelly\.com/videos/(?P<id>.+)" + _TESTS = [ + { + "url": "https://www.softwhiteunderbelly.com/videos/kenneth-final1", + "note": "A single Soft White Underbelly Episode", + "md5": "8e79f29ec1f1bda6da2e0b998fcbebb8", + "info_dict": { + "id": "3201266", + "ext": "mp4", + "display_id": "kenneth-final1", + "title": "Appalachian Man interview-Kenneth", + "description": "Soft White Underbelly interview and portrait of Kenneth, an Appalachian man in Clay County, Kentucky.", + "thumbnail": 
"https://vhx.imgix.net/softwhiteunderbelly/assets/249f6db0-2b39-49a4-979b-f8dad4681825.jpg", + "uploader_url": "https://vimeo.com/user80538407", + "uploader": "OTT Videos", + "uploader_id": "user80538407", + "duration": 512, + }, + "expected_warnings": ["Failed to parse XML: not well-formed"], + }, + { + "url": "https://www.softwhiteunderbelly.com/videos/tj-2-final-2160p", + "note": "A single Soft White Underbelly Episode", + "md5": "286bd8851b4824c62afb369e6f307036", + "info_dict": { + "id": "3506029", + "ext": "mp4", + "display_id": "tj-2-final-2160p", + "title": "Fentanyl Addict interview-TJ (follow up)", + "description": "Soft White Underbelly follow up interview and portrait of TJ, a fentanyl addict on Skid Row.", + "thumbnail": "https://vhx.imgix.net/softwhiteunderbelly/assets/c883d531-5da0-4faf-a2e2-8eba97e5adfc.jpg", + "duration": 817, + "uploader": "OTT Videos", + "uploader_url": "https://vimeo.com/user80538407", + "uploader_id": "user80538407", + }, + "expected_warnings": ["Failed to parse XML: not well-formed"], + }, + ] + + def _get_authenticity_token(self, display_id): + signin_page = self._download_webpage(self._LOGIN_URL, display_id, note="Getting authenticity token") + return self._html_search_regex( + r'name=["\']authenticity_token["\'] value=["\'](.+?)["\']', signin_page, "authenticity_token" + ) + + def _login(self, display_id): + username, password = self._get_login_info() + if not username: + return True + + response = self._download_webpage( + self._LOGIN_URL, + display_id, + note="Logging in", + fatal=False, + data=urlencode_postdata({ + "email": username, + "password": password, + "authenticity_token": self._get_authenticity_token(display_id), + "utf8": True, + }), + ) + + user_has_subscription = self._search_regex( + r'user_has_subscription:\s*["\'](.+?)["\']', response, "subscription status", default="none" + ) + if user_has_subscription.lower() == "true": + return + elif user_has_subscription.lower() == "false": + return "Account is not 
subscribed" + else: + return "Incorrect username/password" + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = None + if self._get_cookies("https://www.softwhiteunderbelly.com").get("_session"): + webpage = self._download_webpage(url, display_id) + if not webpage or '