import re from .wistia import WistiaBaseIE from ..utils import clean_html, get_elements_html_by_class class ThirtyDaySingerBase(WistiaBaseIE): _INDEX_EXTRACTION_RE = r'/tutorial/[\w-]+/(?P[\w-]+)' def _extract_for_url(self, url): lesson_index = re.search(self._INDEX_EXTRACTION_RE, url).group('index') webpage = self._download_webpage(url, lesson_index) match = next(self._extract_wistia_async_embed(webpage)) embed_config = self._download_embed_config('medias', match.group('id'), url) embed_infojson = self._extract_media(embed_config) webpage_infojson = self._extract_webpage_data(webpage) return {**embed_infojson, **webpage_infojson} def _extract_webpage_data(self, webpage): title = self._html_search_regex(r'

([^<]+)

', webpage, 'title') fallback_title = self._html_extract_title(webpage) description = self._html_search_meta('description', webpage, fatal=False) return { 'title': title or fallback_title, 'description': clean_html(self._format_html_list(description)), } # The site makes extensive use of HTML lists for formatting and `clean_html` # doesn't handle them well. This is needed to keep lists readable. def _format_html_list(self, html): replacements = { '