From c9bf4247a81bacb08f6bc38d278851f5e1f1c4aa Mon Sep 17 00:00:00 2001
From: rany2 <rany2@riseup.net>
Date: Fri, 16 Feb 2024 18:36:20 +0200
Subject: [PATCH] Refactor communicate for better readability

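Also improve performance on larger documents by reusing a single
WebSocket connection for all text chunks instead of opening a new
connection per chunk, and add a receive_timeout parameter to
Communicate.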

Signed-off-by: rany2 <rany2@riseup.net>
---
 pylintrc                    |  62 +++++--
 src/edge_tts/communicate.py | 341 +++++++++++++++++++-----------------
 2 files changed, 224 insertions(+), 179 deletions(-)

diff --git a/pylintrc b/pylintrc
index 2e6b1c7..2ce822c 100644
--- a/pylintrc
+++ b/pylintrc
@@ -5,6 +5,10 @@
 # only in one or another interpreter, leading to false positives when analysed.
 analyse-fallback-blocks=no
 
+# Clear in-memory caches upon conclusion of linting. Useful if running pylint
+# in a server-like mode.
+clear-cache-post-run=no
+
 # Load and enable all available extensions. Use --list-extensions to see a list
 # all available extensions.
 #enable-all-extensions=
@@ -46,8 +50,8 @@ ignore=CVS
 
 # Add files or directories matching the regular expressions patterns to the
 # ignore-list. The regex matches against paths and can be in Posix or Windows
-# format. Because '\' represents the directory delimiter on Windows systems, it
-# can't be used as an escape character.
+# format. Because '\\' represents the directory delimiter on Windows systems,
+# it can't be used as an escape character.
 ignore-paths=
 
 # Files or directories matching the regular expression patterns are skipped.
@@ -84,11 +88,17 @@ persistent=yes
 
 # Minimum Python version to use for version dependent checks. Will default to
 # the version used to run pylint.
-py-version=3.10
+py-version=3.11
 
 # Discover python modules and packages in the file system subtree.
 recursive=no
 
+# Add paths to the list of the source roots. Supports globbing patterns. The
+# source root is an absolute path or a path relative to the current working
+# directory used to determine a package namespace for modules located under the
+# source root.
+source-roots=
+
 # When enabled, pylint would attempt to guess common misconfiguration and emit
 # user-friendly hints instead of false-positive error messages.
 suggestion-mode=yes
@@ -224,6 +234,10 @@ no-docstring-rgx=^_
 # These decorators are taken in consideration only for invalid-name.
 property-classes=abc.abstractproperty
 
+# Regular expression matching correct type alias names. If left empty, type
+# alias names will be checked with the set naming style.
+#typealias-rgx=
+
 # Regular expression matching correct type variable names. If left empty, type
 # variable names will be checked with the set naming style.
 #typevar-rgx=
@@ -246,21 +260,18 @@ check-protected-access-in-special-methods=no
 defining-attr-methods=__init__,
                       __new__,
                       setUp,
+                      asyncSetUp,
                       __post_init__
 
 # List of member names, which should be excluded from the protected access
 # warning.
-exclude-protected=_asdict,
-                  _fields,
-                  _replace,
-                  _source,
-                  _make
+exclude-protected=_asdict,_fields,_replace,_source,_make,os._exit
 
 # List of valid names for the first argument in a class method.
 valid-classmethod-first-arg=cls
 
 # List of valid names for the first argument in a metaclass class method.
-valid-metaclass-classmethod-first-arg=cls
+valid-metaclass-classmethod-first-arg=mcs
 
 
 [DESIGN]
@@ -274,7 +285,7 @@ exclude-too-few-public-methods=
 ignored-parents=
 
 # Maximum number of arguments for function / method.
-max-args=5
+max-args=10
 
 # Maximum number of attributes for a class (see R0902).
 max-attributes=7
@@ -307,8 +318,7 @@ min-public-methods=2
 [EXCEPTIONS]
 
 # Exceptions that will emit a warning when caught.
-overgeneral-exceptions=builtins.BaseException,
-                       builtins.Exception
+overgeneral-exceptions=builtins.BaseException,builtins.Exception
 
 
 [FORMAT]
@@ -327,7 +337,7 @@ indent-after-paren=4
 indent-string='    '
 
 # Maximum number of characters on a single line.
-max-line-length=240
+max-line-length=100
 
 # Maximum number of lines in a module.
 max-module-lines=1000
@@ -347,6 +357,9 @@ single-line-if-stmt=no
 # one.
 allow-any-import-level=
 
+# Allow explicit reexports by alias from a package __init__.
+allow-reexport-from-package=no
+
 # Allow wildcard imports from modules that define __all__.
 allow-wildcard-with-all=no
 
@@ -408,14 +421,24 @@ confidence=HIGH,
 # --enable=similarities". If you want to run only the classes checker, but have
 # no Warning level messages displayed, use "--disable=all --enable=classes
 # --disable=W".
-disable=duplicate-code,
+disable=raw-checker-failed,
+        bad-inline-option,
+        locally-disabled,
+        file-ignored,
+        suppressed-message,
+        useless-suppression,
+        deprecated-pragma,
+        use-symbolic-message-instead,
+        use-implicit-booleaness-not-comparison-to-string,
+        use-implicit-booleaness-not-comparison-to-zero,
+        duplicate-code,
         consider-using-with
 
 # Enable the message, report, category or checker with the given id(s). You can
 # either give multiple identifier separated by comma (,) or put this option
 # multiple time (only on the command line, not in the configuration file where
 # it should appear only once). See also the "--disable" option for examples.
-enable=c-extension-no-member
+enable=
 
 
 [METHOD_ARGS]
@@ -461,8 +484,9 @@ evaluation=max(0, 0 if fatal else 10.0 - ((float(5 * error + warning + refactor
 # used to format the message information. See doc for all details.
 msg-template=
 
-# Set the output format. Available formats are text, parseable, colorized, json
-# and msvs (visual studio). You can also give a reporter class, e.g.
+# Set the output format. Available formats are: text, parseable, colorized,
+# json2 (improved json format), json (old json format) and msvs (visual
+# studio). You can also give a reporter class, e.g.
 # mypackage.mymodule.MyReporterClass.
 #output-format=
 
@@ -496,8 +520,8 @@ min-similarity-lines=4
 # Limits count of emitted suggestions for spelling mistakes.
 max-spelling-suggestions=4
 
-# Spelling dictionary name. Available dictionaries: none. To make it work,
-# install the 'python-enchant' package.
+# Spelling dictionary name. No available dictionaries : You need to install
+# both the python package and the system dependency for enchant to work.
 spelling-dict=
 
 # List of comma separated words that should be considered directives if they
diff --git a/src/edge_tts/communicate.py b/src/edge_tts/communicate.py
index 06cc145..e597c18 100644
--- a/src/edge_tts/communicate.py
+++ b/src/edge_tts/communicate.py
@@ -229,6 +229,25 @@ class Communicate:
     Class for communicating with the service.
     """
 
+    @staticmethod
+    def validate_string_param(param_name: str, param_value: str, pattern: str) -> str:
+        """
+        Validates the given string parameter based on type and pattern.
+
+        Args:
+            param_name (str): The name of the parameter, used in error messages.
+            param_value (str): The value to check; TypeError is raised if not a str.
+            pattern (str): Regex the whole value must match, else ValueError is raised.
+
+        Returns:
+            str: The validated parameter, unchanged.
+        """
+        if not isinstance(param_value, str):
+            raise TypeError(f"{param_name} must be str")
+        if re.fullmatch(pattern, param_value) is None:  # "$" alone allows a final "\n"
+            raise ValueError(f"Invalid {param_name} '{param_value}'.")
+        return param_value
+
     def __init__(
         self,
         text: str,
@@ -238,6 +257,7 @@ class Communicate:
         volume: str = "+0%",
         pitch: str = "+0Hz",
         proxy: Optional[str] = None,
+        receive_timeout: int = 5,
     ):
         """
         Initializes the Communicate class.
@@ -270,190 +290,191 @@ class Communicate:
                 + f" ({lang}-{region}, {name})"
             )
 
-        if (
-            re.match(
-                r"^Microsoft Server Speech Text to Speech Voice \(.+,.+\)$",
-                self.voice,
-            )
-            is None
-        ):
-            raise ValueError(f"Invalid voice '{voice}'.")
-
-        if not isinstance(rate, str):
-            raise TypeError("rate must be str")
-        if re.match(r"^[+-]\d+%$", rate) is None:
-            raise ValueError(f"Invalid rate '{rate}'.")
-        self.rate: str = rate
-
-        if not isinstance(volume, str):
-            raise TypeError("volume must be str")
-        if re.match(r"^[+-]\d+%$", volume) is None:
-            raise ValueError(f"Invalid volume '{volume}'.")
-        self.volume: str = volume
-
-        if not isinstance(pitch, str):
-            raise TypeError("pitch must be str")
-        if re.match(r"^[+-]\d+Hz$", pitch) is None:
-            raise ValueError(f"Invalid pitch '{pitch}'.")
-        self.pitch: str = pitch
+        self.voice = self.validate_string_param(
+            "voice",
+            self.voice,
+            r"^Microsoft Server Speech Text to Speech Voice \(.+,.+\)$",
+        )
+        self.rate = self.validate_string_param("rate", rate, r"^[+-]\d+%$")
+        self.volume = self.validate_string_param("volume", volume, r"^[+-]\d+%$")
+        self.pitch = self.validate_string_param("pitch", pitch, r"^[+-]\d+Hz$")
 
         if proxy is not None and not isinstance(proxy, str):
             raise TypeError("proxy must be str")
         self.proxy: Optional[str] = proxy
 
+        if not isinstance(receive_timeout, int):
+            raise TypeError("receive_timeout must be int")
+        self.receive_timeout: int = receive_timeout
+
     async def stream(self) -> AsyncGenerator[Dict[str, Any], None]:
         """Streams audio and metadata from the service."""
 
+        async def send_command_request() -> None:
+            """Sends the speech.config message (metadata options and output format)."""
+
+            # Prepare the request to be sent to the service.
+            #
+            # Note sentenceBoundaryEnabled and wordBoundaryEnabled are actually supposed
+            # to be booleans, but Edge Browser seems to send them as strings.
+            #
+            # This is a bug in Edge as Azure Cognitive Services actually sends them as
+            # bool and not string. For now I will send them as bool unless it causes
+            # any problems.
+            #
+            # Only the X-Timestamp line is an f-string; the JSON braces are not doubled.
+            await websocket.send_str(
+                f"X-Timestamp:{date_to_string()}\r\n"
+                "Content-Type:application/json; charset=utf-8\r\n"
+                "Path:speech.config\r\n\r\n"
+                '{"context":{"synthesis":{"audio":{"metadataoptions":{'
+                '"sentenceBoundaryEnabled":false,"wordBoundaryEnabled":true},'
+                '"outputFormat":"audio-24khz-48kbitrate-mono-mp3"'
+                "}}}}\r\n"
+            )
+
+        async def send_ssml_request() -> bool:
+            """Sends the next chunk as SSML; returns False when no chunks remain."""
+
+            # Get the next string from the generator.
+            text = next(texts, None)
+
+            # If there are no more strings, return False.
+            if text is None:
+                return False
+
+            # Send the request to the service and return True.
+            await websocket.send_str(
+                ssml_headers_plus_data(
+                    connect_id(),
+                    date_to_string(),
+                    mkssml(text, self.voice, self.rate, self.volume, self.pitch),
+                )
+            )
+            return True
+
+        def parse_metadata() -> Dict[str, Any]:
+            """Returns the first WordBoundary event in the received metadata."""
+            for meta_obj in json.loads(data)["Metadata"]:
+                meta_type = meta_obj["Type"]
+                if meta_type == "WordBoundary":
+                    return {
+                        "type": meta_type,
+                        "offset": meta_obj["Data"]["Offset"] + offset_compensation,
+                        "duration": meta_obj["Data"]["Duration"],
+                        "text": meta_obj["Data"]["text"]["Text"],
+                    }
+                if meta_type != "SessionEnd":
+                    raise UnknownResponse(f"Unknown metadata type: {meta_type}")
+            # The caller indexes the result, so raise instead of implicitly
+            # returning None when the message carries no WordBoundary.
+            raise UnexpectedResponse("No WordBoundary metadata found")
+
+        # Split the text into multiple strings if it is too long for the service.
         texts = split_text_by_byte_length(
             escape(remove_incompatible_characters(self.text)),
             calc_max_mesg_size(self.voice, self.rate, self.volume, self.pitch),
         )
-        final_utterance: Dict[int, int] = {}
-        prev_idx = -1
-        shift_time = -1
 
+        # Keep track of last duration + offset to calculate the offset
+        # upon word split.
+        last_duration_offset = 0
+
+        # Current offset compensations.
+        offset_compensation = 0
+
+        # Create a new connection to the service.
         ssl_ctx = ssl.create_default_context(cafile=certifi.where())
-        for idx, text in enumerate(texts):
-            async with aiohttp.ClientSession(
-                trust_env=True,
-            ) as session, session.ws_connect(
-                f"{WSS_URL}&ConnectionId={connect_id()}",
-                compress=15,
-                autoclose=True,
-                autoping=True,
-                proxy=self.proxy,
-                headers={
-                    "Pragma": "no-cache",
-                    "Cache-Control": "no-cache",
-                    "Origin": "chrome-extension://jdiccldimpdaibmpdkjnbmckianbfold",
-                    "Accept-Encoding": "gzip, deflate, br",
-                    "Accept-Language": "en-US,en;q=0.9",
-                    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
-                    " (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36 Edg/91.0.864.41",
-                },
-                ssl=ssl_ctx,
-            ) as websocket:
-                # download indicates whether we should be expecting audio data,
-                # this is so what we avoid getting binary data from the websocket
-                # and falsely thinking it's audio data.
-                download_audio = False
+        async with aiohttp.ClientSession(
+            trust_env=True,
+        ) as session, session.ws_connect(
+            f"{WSS_URL}&ConnectionId={connect_id()}",
+            compress=15,
+            proxy=self.proxy,
+            receive_timeout=self.receive_timeout,
+            headers={
+                "Pragma": "no-cache",
+                "Cache-Control": "no-cache",
+                "Origin": "chrome-extension://jdiccldimpdaibmpdkjnbmckianbfold",
+                "Accept-Encoding": "gzip, deflate, br",
+                "Accept-Language": "en-US,en;q=0.9",
+                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
+                " (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36 Edg/91.0.864.41",
+            },
+            ssl=ssl_ctx,
+        ) as websocket:
+            # audio_was_received indicates whether we have received audio data
+            # from the websocket. This is so we can raise an exception if we
+            # don't receive any audio data.
+            audio_was_received = False
 
-                # audio_was_received indicates whether we have received audio data
-                # from the websocket. This is so we can raise an exception if we
-                # don't receive any audio data.
-                audio_was_received = False
+            # Send the request to the service.
+            await send_command_request()
 
-                # Each message needs to have the proper date.
-                date = date_to_string()
+            # Send the SSML request to the service.
+            await send_ssml_request()
 
-                # Prepare the request to be sent to the service.
-                #
-                # Note sentenceBoundaryEnabled and wordBoundaryEnabled are actually supposed
-                # to be booleans, but Edge Browser seems to send them as strings.
-                #
-                # This is a bug in Edge as Azure Cognitive Services actually sends them as
-                # bool and not string. For now I will send them as bool unless it causes
-                # any problems.
-                #
-                # Also pay close attention to double { } in request (escape for f-string).
-                await websocket.send_str(
-                    f"X-Timestamp:{date}\r\n"
-                    "Content-Type:application/json; charset=utf-8\r\n"
-                    "Path:speech.config\r\n\r\n"
-                    '{"context":{"synthesis":{"audio":{"metadataoptions":{'
-                    '"sentenceBoundaryEnabled":false,"wordBoundaryEnabled":true},'
-                    '"outputFormat":"audio-24khz-48kbitrate-mono-mp3"'
-                    "}}}}\r\n"
-                )
+            async for received in websocket:
+                if received.type == aiohttp.WSMsgType.TEXT:
+                    parameters, data = get_headers_and_data(received.data)
+                    path = parameters.get(b"Path")
+                    if path == b"audio.metadata":
+                        # Parse the metadata and yield it.
+                        parsed_metadata = parse_metadata()
+                        yield parsed_metadata
 
-                await websocket.send_str(
-                    ssml_headers_plus_data(
-                        connect_id(),
-                        date,
-                        mkssml(text, self.voice, self.rate, self.volume, self.pitch),
-                    )
-                )
+                        # Update the last duration offset for use by the next SSML request.
+                        last_duration_offset = (
+                            parsed_metadata["offset"] + parsed_metadata["duration"]
+                        )
+                    elif path == b"turn.end":
+                        # Update the offset compensation for the next SSML request.
+                        offset_compensation = last_duration_offset
 
-                async for received in websocket:
-                    if received.type == aiohttp.WSMsgType.TEXT:
-                        parameters, data = get_headers_and_data(received.data)
-                        path = parameters.get(b"Path")
-                        if path == b"turn.start":
-                            download_audio = True
-                        elif path == b"turn.end":
-                            download_audio = False
-                            break  # End of audio data
-                        elif path == b"audio.metadata":
-                            for meta_obj in json.loads(data)["Metadata"]:
-                                meta_type = meta_obj["Type"]
-                                if idx != prev_idx:
-                                    shift_time = sum(
-                                        final_utterance[i] for i in range(idx)
-                                    )
-                                    prev_idx = idx
-                                if meta_type == "WordBoundary":
-                                    final_utterance[idx] = (
-                                        meta_obj["Data"]["Offset"]
-                                        + meta_obj["Data"]["Duration"]
-                                        # Average padding added by the service
-                                        # Alternatively we could use ffmpeg to get value properly
-                                        # but I don't want to add an additional dependency
-                                        # if this is found to work well enough.
-                                        + 8_750_000
-                                    )
-                                    yield {
-                                        "type": meta_type,
-                                        "offset": meta_obj["Data"]["Offset"]
-                                        + shift_time,
-                                        "duration": meta_obj["Data"]["Duration"],
-                                        "text": meta_obj["Data"]["text"]["Text"],
-                                    }
-                                elif meta_type == "SessionEnd":
-                                    continue
-                                else:
-                                    raise UnknownResponse(
-                                        f"Unknown metadata type: {meta_type}"
-                                    )
-                        elif path == b"response":
-                            pass
-                        else:
-                            raise UnknownResponse(
-                                "The response from the service is not recognized.\n"
-                                + received.data
-                            )
-                    elif received.type == aiohttp.WSMsgType.BINARY:
-                        if not download_audio:
-                            raise UnexpectedResponse(
-                                "We received a binary message, but we are not expecting one."
-                            )
+                        # Use average padding typically added by the service
+                        # to the end of the audio data. This seems to work pretty
+                        # well for now, but we might ultimately need to use a
+                        # more sophisticated method like using ffmpeg to get
+                        # the actual duration of the audio data.
+                        offset_compensation += 8_750_000
 
-                        if len(received.data) < 2:
-                            raise UnexpectedResponse(
-                                "We received a binary message, but it is missing the header length."
-                            )
-
-                        # See: https://github.com/microsoft/cognitive-services-speech-sdk-js/blob/d071d11/src/common.speech/WebsocketMessageFormatter.ts#L46
-                        header_length = int.from_bytes(received.data[:2], "big")
-                        if len(received.data) < header_length + 2:
-                            raise UnexpectedResponse(
-                                "We received a binary message, but it is missing the audio data."
-                            )
-
-                        yield {
-                            "type": "audio",
-                            "data": received.data[header_length + 2 :],
-                        }
-                        audio_was_received = True
-                    elif received.type == aiohttp.WSMsgType.ERROR:
-                        raise WebSocketError(
-                            received.data if received.data else "Unknown error"
+                        # Send the next SSML request to the service.
+                        if not await send_ssml_request():
+                            break
+                    elif path in (b"response", b"turn.start"):
+                        pass
+                    else:
+                        raise UnknownResponse(
+                            "The response from the service is not recognized.\n"
+                            + received.data
+                        )
+                elif received.type == aiohttp.WSMsgType.BINARY:
+                    if len(received.data) < 2:
+                        raise UnexpectedResponse(
+                            "We received a binary message, but it is missing the header length."
                         )
 
-                if not audio_was_received:
-                    raise NoAudioReceived(
-                        "No audio was received. Please verify that your parameters are correct."
+                    header_length = int.from_bytes(received.data[:2], "big")
+                    if len(received.data) < header_length + 2:
+                        raise UnexpectedResponse(
+                            "We received a binary message, but it is missing the audio data."
+                        )
+
+                    audio_was_received = True
+                    yield {
+                        "type": "audio",
+                        "data": received.data[header_length + 2 :],
+                    }
+                elif received.type == aiohttp.WSMsgType.ERROR:
+                    raise WebSocketError(
+                        received.data if received.data else "Unknown error"
                     )
 
+            if not audio_was_received:
+                raise NoAudioReceived(
+                    "No audio was received. Please verify that your parameters are correct."
+                )
+
     async def save(
         self,
         audio_fname: Union[str, bytes],