Spaces:
Sleeping
Sleeping
Update webscout.py
Browse files- webscout.py +572 -9
webscout.py
CHANGED
|
@@ -20,14 +20,80 @@ try:
|
|
| 20 |
except ImportError:
|
| 21 |
LXML_AVAILABLE = False
|
| 22 |
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
|
| 32 |
logger = logging.getLogger("webscout.WEBS")
|
| 33 |
|
|
@@ -1078,4 +1144,501 @@ class WEBS:
|
|
| 1078 |
except Exception as e:
|
| 1079 |
raise e
|
| 1080 |
|
| 1081 |
-
return results
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
except ImportError:
|
| 21 |
LXML_AVAILABLE = False
|
| 22 |
|
| 23 |
+
import re
|
| 24 |
+
from decimal import Decimal
|
| 25 |
+
from html import unescape
|
| 26 |
+
from math import atan2, cos, radians, sin, sqrt
|
| 27 |
+
from typing import Any, Dict, List, Union
|
| 28 |
+
from urllib.parse import unquote
|
| 29 |
+
import orjson
|
| 30 |
+
|
| 31 |
+
from .exceptions import WebscoutE
|
| 32 |
+
|
| 33 |
+
REGEX_STRIP_TAGS = re.compile("<.*?>")
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def json_dumps(obj: Any) -> str:
    """Serialize *obj* to a JSON string with orjson.

    :raises WebscoutE: wrapping any serialization failure.
    """
    try:
        serialized = orjson.dumps(obj).decode("utf-8")
    except Exception as ex:
        # Surface every failure as the package's own exception type.
        raise WebscoutE(f"{type(ex).__name__}: {ex}") from ex
    return serialized
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def json_loads(obj: Union[str, bytes]) -> Any:
    """Deserialize JSON text or bytes with orjson.

    :raises WebscoutE: wrapping any parse failure.
    """
    try:
        parsed = orjson.loads(obj)
    except Exception as ex:
        # Surface every failure as the package's own exception type.
        raise WebscoutE(f"{type(ex).__name__}: {ex}") from ex
    else:
        return parsed
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def _extract_vqd(html_bytes: bytes, keywords: str) -> str:
    """Extract vqd from html bytes.

    Tries the three quoting variants DuckDuckGo uses for the vqd token;
    raises WebscoutE if none of them is found.
    """
    candidates = (
        (b'vqd="', b'"'),
        (b"vqd=", b"&"),
        (b"vqd='", b"'"),
    )
    for prefix, terminator in candidates:
        try:
            begin = html_bytes.index(prefix) + len(prefix)
            finish = html_bytes.index(terminator, begin)
            return html_bytes[begin:finish].decode()
        except ValueError:
            # Marker not present (or not decodable) in this variant; try the next one.
            continue
    raise WebscoutE(f"_extract_vqd() {keywords=} Could not extract vqd.")
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def _text_extract_json(html_bytes: bytes, keywords: str) -> List[Dict[str, str]]:
    """text(backend="api") -> extract json from html.

    Locates the JSON payload embedded between ``DDG.pageLayout.load('d',`` and
    ``);DDG.duckbar.load(`` in DuckDuckGo's HTML and decodes it.

    :param html_bytes: raw HTML response body
    :param keywords: the search keywords (used only for the error message)
    :raises WebscoutE: if the payload cannot be located or decoded.
    """
    try:
        start = html_bytes.index(b"DDG.pageLayout.load('d',") + 24
        end = html_bytes.index(b");DDG.duckbar.load(", start)
        data = html_bytes[start:end]
        result: List[Dict[str, str]] = json_loads(data)
        return result
    except Exception as ex:
        raise WebscoutE(f"_text_extract_json() {keywords=} {type(ex).__name__}: {ex}") from ex
    # NOTE: a trailing `raise WebscoutE(f"... return None")` was removed here —
    # it was unreachable, since every path above either returns or raises.
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
def _normalize(raw_html: str) -> str:
    """Strip HTML tags from the raw_html string."""
    if not raw_html:
        # Preserve the falsy-input contract: empty/None input yields "".
        return ""
    stripped = REGEX_STRIP_TAGS.sub("", raw_html)
    return unescape(stripped)
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
def _normalize_url(url: str) -> str:
|
| 85 |
+
"""Unquote URL and replace spaces with '+'."""
|
| 86 |
+
return unquote(url.replace(" ", "+")) if url else ""
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
def _calculate_distance(lat1: Decimal, lon1: Decimal, lat2: Decimal, lon2: Decimal) -> float:
|
| 90 |
+
"""Calculate distance between two points in km. Haversine formula."""
|
| 91 |
+
R = 6371.0087714 # Earth's radius in km
|
| 92 |
+
rlat1, rlon1, rlat2, rlon2 = map(radians, [float(lat1), float(lon1), float(lat2), float(lon2)])
|
| 93 |
+
dlon, dlat = rlon2 - rlon1, rlat2 - rlat1
|
| 94 |
+
a = sin(dlat / 2) ** 2 + cos(rlat1) * cos(rlat2) * sin(dlon / 2) ** 2
|
| 95 |
+
c = 2 * atan2(sqrt(a), sqrt(1 - a))
|
| 96 |
+
return R * c
|
| 97 |
|
| 98 |
logger = logging.getLogger("webscout.WEBS")
|
| 99 |
|
|
|
|
| 1144 |
except Exception as e:
|
| 1145 |
raise e
|
| 1146 |
|
| 1147 |
+
return results
|
| 1148 |
+
import requests
|
| 1149 |
+
import http.cookiejar as cookiejar
|
| 1150 |
+
import sys
|
| 1151 |
+
import json
|
| 1152 |
+
from xml.etree import ElementTree
|
| 1153 |
+
import re
|
| 1154 |
+
from requests import HTTPError
|
| 1155 |
+
import html.parser
|
| 1156 |
+
|
| 1157 |
+
html_parser = html.parser.HTMLParser()
|
| 1158 |
+
import html
|
| 1159 |
+
|
| 1160 |
+
def unescape(string):
    """Convert HTML character references in *string* back to characters.

    Thin module-level wrapper around ``html.unescape`` (shadows the earlier
    ``from html import unescape`` binding with equivalent behavior).
    """
    return html.unescape(string)
|
| 1162 |
+
WATCH_URL = 'https://www.youtube.com/watch?v={video_id}'
|
| 1163 |
+
|
| 1164 |
+
class TranscriptRetrievalError(Exception):
    """
    Base class for exceptions raised when a transcript cannot be retrieved.
    """
    # Template for the headline of every error message; {video_url} is filled
    # from WATCH_URL with this error's video id.
    ERROR_MESSAGE = '\nCould not retrieve a transcript for the video {video_url}!'
    # Appended only when a subclass supplies a non-empty cause.
    CAUSE_MESSAGE_INTRO = ' This is most likely caused by:\n\n{cause}'
    # Subclasses override this with the specific cause text.
    CAUSE_MESSAGE = ''
    GITHUB_REFERRAL = (
        '\n\nIf you are sure that the described cause is not responsible for this error '
        'and that a transcript should be retrievable, please create an issue at '
        'https://github.com/OE-LUCIFER/Webscout/issues. '
        'Please add which version of webscout you are using '
        'and provide the information needed to replicate the error. '
    )

    def __init__(self, video_id):
        """Build the full error message for *video_id* and store the id."""
        self.video_id = video_id
        super(TranscriptRetrievalError, self).__init__(self._build_error_message())

    def _build_error_message(self):
        """Assemble headline + cause + referral text for this error."""
        cause = self.cause
        error_message = self.ERROR_MESSAGE.format(video_url=WATCH_URL.format(video_id=self.video_id))

        if cause:
            error_message += self.CAUSE_MESSAGE_INTRO.format(cause=cause) + self.GITHUB_REFERRAL

        return error_message

    @property
    def cause(self):
        """The cause text; subclasses may override to interpolate details."""
        return self.CAUSE_MESSAGE
|
| 1195 |
+
|
| 1196 |
+
class YouTubeRequestFailedError(TranscriptRetrievalError):
    """Raised when the HTTP request to YouTube fails."""
    # {reason} is filled with the stringified HTTP error.
    CAUSE_MESSAGE = 'Request to YouTube failed: {reason}'

    def __init__(self, video_id, http_error):
        # Keep the stringified HTTP error so `cause` can interpolate it.
        self.reason = str(http_error)
        super(YouTubeRequestFailedError, self).__init__(video_id)

    @property
    def cause(self):
        return self.CAUSE_MESSAGE.format(reason=self.reason)
|
| 1206 |
+
|
| 1207 |
+
class VideoUnavailableError(TranscriptRetrievalError):
    """Raised when the video is no longer available."""
    CAUSE_MESSAGE = 'The video is no longer available'
|
| 1209 |
+
|
| 1210 |
+
class InvalidVideoIdError(TranscriptRetrievalError):
    """Raised when a full URL was passed where a bare video id was expected."""
    CAUSE_MESSAGE = (
        'You provided an invalid video id. Make sure you are using the video id and NOT the url!\n\n'
        'Do NOT run: `YouTubeTranscriptApi.get_transcript("https://www.youtube.com/watch?v=1234")`\n'
        'Instead run: `YouTubeTranscriptApi.get_transcript("1234")`'
    )
|
| 1216 |
+
|
| 1217 |
+
class TooManyRequestsError(TranscriptRetrievalError):
    """Raised when YouTube rate-limits this IP and demands a captcha."""
    # NOTE: the trailing backslashes are line continuations *inside* the string
    # literal, so the "- ..." bullet lines below are part of the message text.
    CAUSE_MESSAGE = (
        'YouTube is receiving too many requests from this IP and now requires solving a captcha to continue. '
        'One of the following things can be done to work around this:\n\
- Manually solve the captcha in a browser and export the cookie. '
        'Read here how to use that cookie with '
        'youtube-transcript-api: https://github.com/jdepoix/youtube-transcript-api#cookies\n\
- Use a different IP address\n\
- Wait until the ban on your IP has been lifted'
    )
|
| 1227 |
+
|
| 1228 |
+
class TranscriptsDisabledError(TranscriptRetrievalError):
    """Raised when subtitles are disabled for the video."""
    CAUSE_MESSAGE = 'Subtitles are disabled for this video'
|
| 1230 |
+
|
| 1231 |
+
class NoTranscriptAvailableError(TranscriptRetrievalError):
    """Raised when no transcripts exist for the video at all."""
    CAUSE_MESSAGE = 'No transcripts are available for this video'
|
| 1233 |
+
|
| 1234 |
+
class NotTranslatableError(TranscriptRetrievalError):
    """Raised when translation is requested for a non-translatable transcript."""
    CAUSE_MESSAGE = 'The requested language is not translatable'
|
| 1236 |
+
|
| 1237 |
+
class TranslationLanguageNotAvailableError(TranscriptRetrievalError):
    """Raised when the requested translation target language is not offered."""
    CAUSE_MESSAGE = 'The requested translation language is not available'
|
| 1239 |
+
|
| 1240 |
+
class CookiePathInvalidError(TranscriptRetrievalError):
    """Raised when the cookie file cannot be loaded from the given path."""
    CAUSE_MESSAGE = 'The provided cookie file was unable to be loaded'
|
| 1242 |
+
|
| 1243 |
+
class CookiesInvalidError(TranscriptRetrievalError):
    """Raised when the cookie file loads but contains no usable cookies."""
    CAUSE_MESSAGE = 'The cookies provided are not valid (may have expired)'
|
| 1245 |
+
|
| 1246 |
+
class FailedToCreateConsentCookieError(TranscriptRetrievalError):
    """Raised when the automatic YouTube consent-cookie handshake fails."""
    CAUSE_MESSAGE = 'Failed to automatically give consent to saving cookies'
|
| 1248 |
+
|
| 1249 |
+
class NoTranscriptFoundError(TranscriptRetrievalError):
    """Raised when none of the requested language codes has a transcript."""
    CAUSE_MESSAGE = (
        'No transcripts were found for any of the requested language codes: {requested_language_codes}\n\n'
        '{transcript_data}'
    )

    def __init__(self, video_id, requested_language_codes, transcript_data):
        # transcript_data is stringified later (its __str__ lists what IS available).
        self._requested_language_codes = requested_language_codes
        self._transcript_data = transcript_data
        super(NoTranscriptFoundError, self).__init__(video_id)

    @property
    def cause(self):
        return self.CAUSE_MESSAGE.format(
            requested_language_codes=self._requested_language_codes,
            transcript_data=str(self._transcript_data),
        )
|
| 1266 |
+
|
| 1267 |
+
|
| 1268 |
+
|
| 1269 |
+
def _raise_http_errors(response, video_id):
    """Return *response* if its status is a success, else raise a package error.

    :param response: the requests.Response to validate
    :param video_id: id of the video the request was made for (used in the error message)
    :raises YouTubeRequestFailedError: if the response carries an HTTP error status
    """
    try:
        response.raise_for_status()
        return response
    except HTTPError as error:
        # BUG FIX: arguments were passed as (error, video_id), but
        # YouTubeRequestFailedError.__init__ is (self, video_id, http_error),
        # so the video id and the failure reason were swapped in the message.
        raise YouTubeRequestFailedError(video_id, error) from error
|
| 1275 |
+
|
| 1276 |
+
|
| 1277 |
+
class TranscriptListFetcher(object):
    """Fetches the watch page for a video and builds its TranscriptList."""

    def __init__(self, http_client):
        # http_client is used for all requests (a requests.Session per the
        # docstrings elsewhere in this module).
        self._http_client = http_client

    def fetch(self, video_id):
        """Fetch the watch page and return the TranscriptList for *video_id*."""
        return TranscriptList.build(
            self._http_client,
            video_id,
            self._extract_captions_json(self._fetch_video_html(video_id), video_id),
        )

    def _extract_captions_json(self, html, video_id):
        """Extract the captions JSON blob from the watch-page HTML.

        Raises a specific TranscriptRetrievalError subclass describing why the
        blob is missing when it cannot be found.
        """
        splitted_html = html.split('"captions":')

        # No "captions" key at all: diagnose the most likely reason.
        if len(splitted_html) <= 1:
            if video_id.startswith('http://') or video_id.startswith('https://'):
                raise InvalidVideoIdError(video_id)
            if 'class="g-recaptcha"' in html:
                raise TooManyRequestsError(video_id)
            if '"playabilityStatus":' not in html:
                raise VideoUnavailableError(video_id)

            raise TranscriptsDisabledError(video_id)

        # The JSON object ends right before the ,"videoDetails key that follows it.
        captions_json = json.loads(
            splitted_html[1].split(',"videoDetails')[0].replace('\n', '')
        ).get('playerCaptionsTracklistRenderer')
        if captions_json is None:
            raise TranscriptsDisabledError(video_id)

        if 'captionTracks' not in captions_json:
            raise TranscriptsDisabledError(video_id)

        return captions_json

    def _create_consent_cookie(self, html, video_id):
        """Set the CONSENT cookie YouTube's consent page asks for."""
        match = re.search('name="v" value="(.*?)"', html)
        if match is None:
            raise FailedToCreateConsentCookieError(video_id)
        self._http_client.cookies.set('CONSENT', 'YES+' + match.group(1), domain='.youtube.com')

    def _fetch_video_html(self, video_id):
        """Fetch the watch page, retrying once after setting the consent cookie."""
        html = self._fetch_html(video_id)
        if 'action="https://consent.youtube.com/s"' in html:
            self._create_consent_cookie(html, video_id)
            html = self._fetch_html(video_id)
            # Still redirected to the consent form: give up.
            if 'action="https://consent.youtube.com/s"' in html:
                raise FailedToCreateConsentCookieError(video_id)
        return html

    def _fetch_html(self, video_id):
        """GET the watch page and return its HTML with entities unescaped."""
        response = self._http_client.get(WATCH_URL.format(video_id=video_id), headers={'Accept-Language': 'en-US'})
        return unescape(_raise_http_errors(response, video_id).text)
|
| 1330 |
+
|
| 1331 |
+
|
| 1332 |
+
class TranscriptList(object):
    """
    This object represents a list of transcripts. It can be iterated over to list all transcripts which are available
    for a given YouTube video. Also it provides functionality to search for a transcript in a given language.
    """

    def __init__(self, video_id, manually_created_transcripts, generated_transcripts, translation_languages):
        """
        The constructor is only for internal use. Use the static build method instead.

        :param video_id: the id of the video this TranscriptList is for
        :type video_id: str
        :param manually_created_transcripts: dict mapping language codes to the manually created transcripts
        :type manually_created_transcripts: dict[str, Transcript]
        :param generated_transcripts: dict mapping language codes to the generated transcripts
        :type generated_transcripts: dict[str, Transcript]
        :param translation_languages: list of languages which can be used for translatable languages
        :type translation_languages: list[dict[str, str]]
        """
        self.video_id = video_id
        self._manually_created_transcripts = manually_created_transcripts
        self._generated_transcripts = generated_transcripts
        self._translation_languages = translation_languages

    @staticmethod
    def build(http_client, video_id, captions_json):
        """
        Factory method for TranscriptList.

        :param http_client: http client which is used to make the transcript retrieving http calls
        :type http_client: requests.Session
        :param video_id: the id of the video this TranscriptList is for
        :type video_id: str
        :param captions_json: the JSON parsed from the YouTube pages static HTML
        :type captions_json: dict
        :return: the created TranscriptList
        :rtype TranscriptList:
        """
        translation_languages = [
            {
                'language': translation_language['languageName']['simpleText'],
                'language_code': translation_language['languageCode'],
            } for translation_language in captions_json.get('translationLanguages', [])
        ]

        manually_created_transcripts = {}
        generated_transcripts = {}

        for caption in captions_json['captionTracks']:
            # kind == 'asr' marks auto-generated (speech-recognition) tracks.
            if caption.get('kind', '') == 'asr':
                transcript_dict = generated_transcripts
            else:
                transcript_dict = manually_created_transcripts

            transcript_dict[caption['languageCode']] = Transcript(
                http_client,
                video_id,
                caption['baseUrl'],
                caption['name']['simpleText'],
                caption['languageCode'],
                caption.get('kind', '') == 'asr',
                translation_languages if caption.get('isTranslatable', False) else [],
            )

        return TranscriptList(
            video_id,
            manually_created_transcripts,
            generated_transcripts,
            translation_languages,
        )

    def __iter__(self):
        # Manually created transcripts first, then generated ones.
        return iter(list(self._manually_created_transcripts.values()) + list(self._generated_transcripts.values()))

    def find_transcript(self, language_codes):
        """
        Finds a transcript for a given language code. Manually created transcripts are returned first and only if none
        are found, generated transcripts are used. If you only want generated transcripts use
        `find_manually_created_transcript` instead.

        :param language_codes: A list of language codes in a descending priority. For example, if this is set to
        ['de', 'en'] it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if
        it fails to do so.
        :type languages: list[str]
        :return: the found Transcript
        :rtype Transcript:
        :raises: NoTranscriptFound
        """
        return self._find_transcript(language_codes, [self._manually_created_transcripts, self._generated_transcripts])

    def find_generated_transcript(self, language_codes):
        """
        Finds an automatically generated transcript for a given language code.

        :param language_codes: A list of language codes in a descending priority. For example, if this is set to
        ['de', 'en'] it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if
        it fails to do so.
        :type languages: list[str]
        :return: the found Transcript
        :rtype Transcript:
        :raises: NoTranscriptFound
        """
        return self._find_transcript(language_codes, [self._generated_transcripts])

    def find_manually_created_transcript(self, language_codes):
        """
        Finds a manually created transcript for a given language code.

        :param language_codes: A list of language codes in a descending priority. For example, if this is set to
        ['de', 'en'] it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if
        it fails to do so.
        :type languages: list[str]
        :return: the found Transcript
        :rtype Transcript:
        :raises: NoTranscriptFound
        """
        return self._find_transcript(language_codes, [self._manually_created_transcripts])

    def _find_transcript(self, language_codes, transcript_dicts):
        # Language priority outranks transcript-kind priority: each requested
        # code is checked against every dict before moving to the next code.
        for language_code in language_codes:
            for transcript_dict in transcript_dicts:
                if language_code in transcript_dict:
                    return transcript_dict[language_code]

        raise NoTranscriptFoundError(
            self.video_id,
            language_codes,
            self
        )

    def __str__(self):
        # Human-readable summary of everything available for this video; used
        # by NoTranscriptFoundError's message via str(self).
        return (
            'For this video ({video_id}) transcripts are available in the following languages:\n\n'
            '(MANUALLY CREATED)\n'
            '{available_manually_created_transcript_languages}\n\n'
            '(GENERATED)\n'
            '{available_generated_transcripts}\n\n'
            '(TRANSLATION LANGUAGES)\n'
            '{available_translation_languages}'
        ).format(
            video_id=self.video_id,
            available_manually_created_transcript_languages=self._get_language_description(
                str(transcript) for transcript in self._manually_created_transcripts.values()
            ),
            available_generated_transcripts=self._get_language_description(
                str(transcript) for transcript in self._generated_transcripts.values()
            ),
            available_translation_languages=self._get_language_description(
                '{language_code} ("{language}")'.format(
                    language=translation_language['language'],
                    language_code=translation_language['language_code'],
                ) for translation_language in self._translation_languages
            )
        )

    def _get_language_description(self, transcript_strings):
        # Bulleted list, or the literal string 'None' when nothing is available.
        description = '\n'.join(' - {transcript}'.format(transcript=transcript) for transcript in transcript_strings)
        return description if description else 'None'
|
| 1490 |
+
|
| 1491 |
+
|
| 1492 |
+
class Transcript(object):
    """A single transcript track of a video; fetch() downloads its cues."""

    def __init__(self, http_client, video_id, url, language, language_code, is_generated, translation_languages):
        """
        You probably don't want to initialize this directly. Usually you'll access Transcript objects using a
        TranscriptList.

        :param http_client: http client which is used to make the transcript retrieving http calls
        :type http_client: requests.Session
        :param video_id: the id of the video this TranscriptList is for
        :type video_id: str
        :param url: the url which needs to be called to fetch the transcript
        :param language: the name of the language this transcript uses
        :param language_code:
        :param is_generated:
        :param translation_languages:
        """
        self._http_client = http_client
        self.video_id = video_id
        self._url = url
        self.language = language
        self.language_code = language_code
        self.is_generated = is_generated
        self.translation_languages = translation_languages
        # language_code -> language name, for O(1) lookup in translate().
        self._translation_languages_dict = {
            translation_language['language_code']: translation_language['language']
            for translation_language in translation_languages
        }

    def fetch(self, preserve_formatting=False):
        """
        Loads the actual transcript data.
        :param preserve_formatting: whether to keep select HTML text formatting
        :type preserve_formatting: bool
        :return: a list of dictionaries containing the 'text', 'start' and 'duration' keys
        :rtype [{'text': str, 'start': float, 'end': float}]:
        """
        response = self._http_client.get(self._url, headers={'Accept-Language': 'en-US'})
        return _TranscriptParser(preserve_formatting=preserve_formatting).parse(
            _raise_http_errors(response, self.video_id).text,
        )

    def __str__(self):
        return '{language_code} ("{language}"){translation_description}'.format(
            language=self.language,
            language_code=self.language_code,
            translation_description='[TRANSLATABLE]' if self.is_translatable else ''
        )

    @property
    def is_translatable(self):
        # Translatable iff YouTube advertised at least one target language.
        return len(self.translation_languages) > 0

    def translate(self, language_code):
        """Return a new Transcript that fetches this track translated to *language_code*.

        :raises NotTranslatableError: if this track offers no translations
        :raises TranslationLanguageNotAvailableError: if *language_code* is not offered
        """
        if not self.is_translatable:
            raise NotTranslatableError(self.video_id)

        if language_code not in self._translation_languages_dict:
            raise TranslationLanguageNotAvailableError(self.video_id)

        return Transcript(
            self._http_client,
            self.video_id,
            '{url}&tlang={language_code}'.format(url=self._url, language_code=language_code),
            self._translation_languages_dict[language_code],
            language_code,
            True,
            [],
        )
|
| 1560 |
+
|
| 1561 |
+
|
| 1562 |
+
class _TranscriptParser(object):
|
| 1563 |
+
_FORMATTING_TAGS = [
|
| 1564 |
+
'strong', # important
|
| 1565 |
+
'em', # emphasized
|
| 1566 |
+
'b', # bold
|
| 1567 |
+
'i', # italic
|
| 1568 |
+
'mark', # marked
|
| 1569 |
+
'small', # smaller
|
| 1570 |
+
'del', # deleted
|
| 1571 |
+
'ins', # inserted
|
| 1572 |
+
'sub', # subscript
|
| 1573 |
+
'sup', # superscript
|
| 1574 |
+
]
|
| 1575 |
+
|
| 1576 |
+
def __init__(self, preserve_formatting=False):
|
| 1577 |
+
self._html_regex = self._get_html_regex(preserve_formatting)
|
| 1578 |
+
|
| 1579 |
+
def _get_html_regex(self, preserve_formatting):
|
| 1580 |
+
if preserve_formatting:
|
| 1581 |
+
formats_regex = '|'.join(self._FORMATTING_TAGS)
|
| 1582 |
+
formats_regex = r'<\/?(?!\/?(' + formats_regex + r')\b).*?\b>'
|
| 1583 |
+
html_regex = re.compile(formats_regex, re.IGNORECASE)
|
| 1584 |
+
else:
|
| 1585 |
+
html_regex = re.compile(r'<[^>]*>', re.IGNORECASE)
|
| 1586 |
+
return html_regex
|
| 1587 |
+
|
| 1588 |
+
def parse(self, plain_data):
|
| 1589 |
+
return [
|
| 1590 |
+
{
|
| 1591 |
+
'text': re.sub(self._html_regex, '', unescape(xml_element.text)),
|
| 1592 |
+
'start': float(xml_element.attrib['start']),
|
| 1593 |
+
'duration': float(xml_element.attrib.get('dur', '0.0')),
|
| 1594 |
+
}
|
| 1595 |
+
for xml_element in ElementTree.fromstring(plain_data)
|
| 1596 |
+
if xml_element.text is not None
|
| 1597 |
+
]
|
| 1598 |
+
|
| 1599 |
+
WATCH_URL = 'https://www.youtube.com/watch?v={video_id}'
|
| 1600 |
+
|
| 1601 |
+
class transcriber(object):
    """Classmethod-based helpers for retrieving YouTube transcripts."""

    @classmethod
    def list_transcripts(cls, video_id, proxies=None, cookies=None):
        """Return the TranscriptList for *video_id*.

        :param video_id: id of the video
        :param proxies: optional proxies dict passed to the requests session
        :param cookies: optional path to a Mozilla-format cookie file
        """
        with requests.Session() as http_client:
            if cookies:
                http_client.cookies = cls._load_cookies(cookies, video_id)
            http_client.proxies = proxies if proxies else {}
            return TranscriptListFetcher(http_client).fetch(video_id)

    @classmethod
    def get_transcripts(cls, video_ids, languages=('en',), continue_after_error=False, proxies=None,
                        cookies=None, preserve_formatting=False):
        """Fetch transcripts for several videos.

        :return: a tuple of (dict mapping video_id -> transcript data,
                 list of video ids that could not be retrieved)
        """
        assert isinstance(video_ids, list), "`video_ids` must be a list of strings"

        data = {}
        unretrievable_videos = []

        for video_id in video_ids:
            try:
                data[video_id] = cls.get_transcript(video_id, languages, proxies, cookies, preserve_formatting)
            except Exception:
                if not continue_after_error:
                    # Bare `raise` re-raises the active exception with its
                    # original traceback intact (vs. `raise exception`).
                    raise
                unretrievable_videos.append(video_id)

        return data, unretrievable_videos

    @classmethod
    def get_transcript(cls, video_id, languages=('en',), proxies=None, cookies=None, preserve_formatting=False):
        """Fetch the transcript of one video in the first matching language."""
        assert isinstance(video_id, str), "`video_id` must be a string"
        return cls.list_transcripts(video_id, proxies, cookies).find_transcript(languages).fetch(preserve_formatting=preserve_formatting)

    @classmethod
    def _load_cookies(cls, cookies, video_id):
        """Load a Mozilla-format cookie file, mapping failures to package errors.

        BUG FIX: the original bare ``except:`` also caught the deliberately
        raised CookiesInvalidError (empty jar) and misreported it as
        CookiePathInvalidError — and swallowed SystemExit/KeyboardInterrupt.
        Only actual load failures are now treated as an invalid path.
        """
        try:
            cookie_jar = cookiejar.MozillaCookieJar()
            cookie_jar.load(cookies)
        except (cookiejar.LoadError, OSError) as load_error:
            raise CookiePathInvalidError(video_id) from load_error
        if not cookie_jar:
            raise CookiesInvalidError(video_id)
        return cookie_jar