"""Functions for working with URLs. Contains implementations of functions from :mod:`urllib.parse` that handle bytes and strings. """ from __future__ import annotations import codecs import os import re import typing as t import warnings from urllib.parse import quote from urllib.parse import unquote from urllib.parse import urlencode from urllib.parse import urlsplit from urllib.parse import urlunsplit from ._internal import _check_str_tuple from ._internal import _decode_idna from ._internal import _make_encode_wrapper from ._internal import _to_str from .datastructures import iter_multi_items if t.TYPE_CHECKING: from . import datastructures as ds # A regular expression for what a valid schema looks like _scheme_re = re.compile(r"^[a-zA-Z0-9+-.]+$") # Characters that are safe in any part of an URL. _always_safe_chars = ( "abcdefghijklmnopqrstuvwxyz" "ABCDEFGHIJKLMNOPQRSTUVWXYZ" "0123456789" "-._~" "$!'()*+,;" # RFC3986 sub-delims set, not including query string delimiters &= ) _always_safe = frozenset(_always_safe_chars.encode("ascii")) _hexdigits = "0123456789ABCDEFabcdef" _hextobyte = { f"{a}{b}".encode("ascii"): int(f"{a}{b}", 16) for a in _hexdigits for b in _hexdigits } _bytetohex = [f"%{char:02X}".encode("ascii") for char in range(256)] class _URLTuple(t.NamedTuple): scheme: str netloc: str path: str query: str fragment: str class BaseURL(_URLTuple): """Superclass of :py:class:`URL` and :py:class:`BytesURL`. .. deprecated:: 2.3 Will be removed in Werkzeug 3.0. Use the ``urllib.parse`` library instead. """ __slots__ = () _at: str _colon: str _lbracket: str _rbracket: str def __new__(cls, *args: t.Any, **kwargs: t.Any) -> BaseURL: warnings.warn( f"'werkzeug.urls.{cls.__name__}' is deprecated and will be removed in" " Werkzeug 3.0. Use the 'urllib.parse' library instead.", DeprecationWarning, stacklevel=2, ) return super().__new__(cls, *args, **kwargs) def __str__(self) -> str: return self.to_url() def replace(self, **kwargs: t.Any) -> BaseURL: """Return an URL with the same values, except for those parameters given new values by whichever keyword arguments are specified.""" return self._replace(**kwargs) @property def host(self) -> str | None: """The host part of the URL if available, otherwise `None`. The host is either the hostname or the IP address mentioned in the URL. It will not contain the port. """ return self._split_host()[0] @property def ascii_host(self) -> str | None: """Works exactly like :attr:`host` but will return a result that is restricted to ASCII. If it finds a netloc that is not ASCII it will attempt to idna decode it. This is useful for socket operations when the URL might include internationalized characters. """ rv = self.host if rv is not None and isinstance(rv, str): try: rv = rv.encode("idna").decode("ascii") except UnicodeError: pass return rv @property def port(self) -> int | None: """The port in the URL as an integer if it was present, `None` otherwise. This does not fill in default ports. """ try: rv = int(_to_str(self._split_host()[1])) if 0 <= rv <= 65535: return rv except (ValueError, TypeError): pass return None @property def auth(self) -> str | None: """The authentication part in the URL if available, `None` otherwise. """ return self._split_netloc()[0] @property def username(self) -> str | None: """The username if it was part of the URL, `None` otherwise. This undergoes URL decoding and will always be a string. """ rv = self._split_auth()[0] if rv is not None: return _url_unquote_legacy(rv) return None @property def raw_username(self) -> str | None: """The username if it was part of the URL, `None` otherwise. Unlike :attr:`username` this one is not being decoded. """ return self._split_auth()[0] @property def password(self) -> str | None: """The password if it was part of the URL, `None` otherwise. This undergoes URL decoding and will always be a string. """ rv = self._split_auth()[1] if rv is not None: return _url_unquote_legacy(rv) return None @property def raw_password(self) -> str | None: """The password if it was part of the URL, `None` otherwise. Unlike :attr:`password` this one is not being decoded. """ return self._split_auth()[1] def decode_query(self, *args: t.Any, **kwargs: t.Any) -> ds.MultiDict[str, str]: """Decodes the query part of the URL. Ths is a shortcut for calling :func:`url_decode` on the query argument. The arguments and keyword arguments are forwarded to :func:`url_decode` unchanged. """ return url_decode(self.query, *args, **kwargs) def join(self, *args: t.Any, **kwargs: t.Any) -> BaseURL: """Joins this URL with another one. This is just a convenience function for calling into :meth:`url_join` and then parsing the return value again. """ return url_parse(url_join(self, *args, **kwargs)) def to_url(self) -> str: """Returns a URL string or bytes depending on the type of the information stored. This is just a convenience function for calling :meth:`url_unparse` for this URL. """ return url_unparse(self) def encode_netloc(self) -> str: """Encodes the netloc part to an ASCII safe URL as bytes.""" rv = self.ascii_host or "" if ":" in rv: rv = f"[{rv}]" port = self.port if port is not None: rv = f"{rv}:{port}" auth = ":".join( filter( None, [ url_quote(self.raw_username or "", "utf-8", "strict", "/:%"), url_quote(self.raw_password or "", "utf-8", "strict", "/:%"), ], ) ) if auth: rv = f"{auth}@{rv}" return rv def decode_netloc(self) -> str: """Decodes the netloc part into a string.""" host = self.host or "" if isinstance(host, bytes): host = host.decode() rv = _decode_idna(host) if ":" in rv: rv = f"[{rv}]" port = self.port if port is not None: rv = f"{rv}:{port}" auth = ":".join( filter( None, [ _url_unquote_legacy(self.raw_username or "", "/:%@"), _url_unquote_legacy(self.raw_password or "", "/:%@"), ], ) ) if auth: rv = f"{auth}@{rv}" return rv def to_uri_tuple(self) -> BaseURL: """Returns a :class:`BytesURL` tuple that holds a URI. This will encode all the information in the URL properly to ASCII using the rules a web browser would follow. It's usually more interesting to directly call :meth:`iri_to_uri` which will return a string. """ return url_parse(iri_to_uri(self)) def to_iri_tuple(self) -> BaseURL: """Returns a :class:`URL` tuple that holds a IRI. This will try to decode as much information as possible in the URL without losing information similar to how a web browser does it for the URL bar. It's usually more interesting to directly call :meth:`uri_to_iri` which will return a string. """ return url_parse(uri_to_iri(self)) def get_file_location( self, pathformat: str | None = None ) -> tuple[str | None, str | None]: """Returns a tuple with the location of the file in the form ``(server, location)``. If the netloc is empty in the URL or points to localhost, it's represented as ``None``. The `pathformat` by default is autodetection but needs to be set when working with URLs of a specific system. The supported values are ``'windows'`` when working with Windows or DOS paths and ``'posix'`` when working with posix paths. If the URL does not point to a local file, the server and location are both represented as ``None``. :param pathformat: The expected format of the path component. Currently ``'windows'`` and ``'posix'`` are supported. Defaults to ``None`` which is autodetect. """ if self.scheme != "file": return None, None path = url_unquote(self.path) host = self.netloc or None if pathformat is None: if os.name == "nt": pathformat = "windows" else: pathformat = "posix" if pathformat == "windows": if path[:1] == "/" and path[1:2].isalpha() and path[2:3] in "|:": path = f"{path[1:2]}:{path[3:]}" windows_share = path[:3] in ("\\" * 3, "/" * 3) import ntpath path = ntpath.normpath(path) # Windows shared drives are represented as ``\\host\\directory``. # That results in a URL like ``file://///host/directory``, and a # path like ``///host/directory``. We need to special-case this # because the path contains the hostname. if windows_share and host is None: parts = path.lstrip("\\").split("\\", 1) if len(parts) == 2: host, path = parts else: host = parts[0] path = "" elif pathformat == "posix": import posixpath path = posixpath.normpath(path) else: raise TypeError(f"Invalid path format {pathformat!r}") if host in ("127.0.0.1", "::1", "localhost"): host = None return host, path def _split_netloc(self) -> tuple[str | None, str]: if self._at in self.netloc: auth, _, netloc = self.netloc.partition(self._at) return auth, netloc return None, self.netloc def _split_auth(self) -> tuple[str | None, str | None]: auth = self._split_netloc()[0] if not auth: return None, None if self._colon not in auth: return auth, None username, _, password = auth.partition(self._colon) return username, password def _split_host(self) -> tuple[str | None, str | None]: rv = self._split_netloc()[1] if not rv: return None, None if not rv.startswith(self._lbracket): if self._colon in rv: host, _, port = rv.partition(self._colon) return host, port return rv, None idx = rv.find(self._rbracket) if idx < 0: return rv, None host = rv[1:idx] rest = rv[idx + 1 :] if rest.startswith(self._colon): return host, rest[1:] return host, None class URL(BaseURL): """Represents a parsed URL. This behaves like a regular tuple but also has some extra attributes that give further insight into the URL. .. deprecated:: 2.3 Will be removed in Werkzeug 3.0. Use the ``urllib.parse`` library instead. """ __slots__ = () _at = "@" _colon = ":" _lbracket = "[" _rbracket = "]" def encode(self, charset: str = "utf-8", errors: str = "replace") -> BytesURL: """Encodes the URL to a tuple made out of bytes. The charset is only being used for the path, query and fragment. """ return BytesURL( self.scheme.encode("ascii"), self.encode_netloc(), self.path.encode(charset, errors), self.query.encode(charset, errors), self.fragment.encode(charset, errors), ) class BytesURL(BaseURL): """Represents a parsed URL in bytes. .. deprecated:: 2.3 Will be removed in Werkzeug 3.0. Use the ``urllib.parse`` library instead. """ __slots__ = () _at = b"@" # type: ignore _colon = b":" # type: ignore _lbracket = b"[" # type: ignore _rbracket = b"]" # type: ignore def __str__(self) -> str: return self.to_url().decode("utf-8", "replace") # type: ignore def encode_netloc(self) -> bytes: # type: ignore """Returns the netloc unchanged as bytes.""" return self.netloc # type: ignore def decode(self, charset: str = "utf-8", errors: str = "replace") -> URL: """Decodes the URL to a tuple made out of strings. The charset is only being used for the path, query and fragment. """ return URL( self.scheme.decode("ascii"), # type: ignore self.decode_netloc(), self.path.decode(charset, errors), # type: ignore self.query.decode(charset, errors), # type: ignore self.fragment.decode(charset, errors), # type: ignore ) _unquote_maps: dict[frozenset[int], dict[bytes, int]] = {frozenset(): _hextobyte} def _unquote_to_bytes(string: str | bytes, unsafe: str | bytes = "") -> bytes: if isinstance(string, str): string = string.encode("utf-8") if isinstance(unsafe, str): unsafe = unsafe.encode("utf-8") unsafe = frozenset(bytearray(unsafe)) groups = iter(string.split(b"%")) result = bytearray(next(groups, b"")) try: hex_to_byte = _unquote_maps[unsafe] except KeyError: hex_to_byte = _unquote_maps[unsafe] = { h: b for h, b in _hextobyte.items() if b not in unsafe } for group in groups: code = group[:2] if code in hex_to_byte: result.append(hex_to_byte[code]) result.extend(group[2:]) else: result.append(37) # % result.extend(group) return bytes(result) def _url_encode_impl( obj: t.Mapping[str, str] | t.Iterable[tuple[str, str]], charset: str, sort: bool, key: t.Callable[[tuple[str, str]], t.Any] | None, ) -> t.Iterator[str]: from .datastructures import iter_multi_items iterable: t.Iterable[tuple[str, str]] = iter_multi_items(obj) if sort: iterable = sorted(iterable, key=key) for key_str, value_str in iterable: if value_str is None: continue if not isinstance(key_str, bytes): key_bytes = str(key_str).encode(charset) else: key_bytes = key_str if not isinstance(value_str, bytes): value_bytes = str(value_str).encode(charset) else: value_bytes = value_str yield f"{_fast_url_quote_plus(key_bytes)}={_fast_url_quote_plus(value_bytes)}" def _url_unquote_legacy(value: str, unsafe: str = "") -> str: try: return url_unquote(value, charset="utf-8", errors="strict", unsafe=unsafe) except UnicodeError: return url_unquote(value, charset="latin1", unsafe=unsafe) def url_parse( url: str, scheme: str | None = None, allow_fragments: bool = True ) -> BaseURL: """Parses a URL from a string into a :class:`URL` tuple. If the URL is lacking a scheme it can be provided as second argument. Otherwise, it is ignored. Optionally fragments can be stripped from the URL by setting `allow_fragments` to `False`. The inverse of this function is :func:`url_unparse`. :param url: the URL to parse. :param scheme: the default schema to use if the URL is schemaless. :param allow_fragments: if set to `False` a fragment will be removed from the URL. .. deprecated:: 2.3 Will be removed in Werkzeug 3.0. Use ``urllib.parse.urlsplit`` instead. """ warnings.warn( "'werkzeug.urls.url_parse' is deprecated and will be removed in Werkzeug 3.0." " Use 'urllib.parse.urlsplit' instead.", DeprecationWarning, stacklevel=2, ) s = _make_encode_wrapper(url) is_text_based = isinstance(url, str) if scheme is None: scheme = s("") netloc = query = fragment = s("") i = url.find(s(":")) if i > 0 and _scheme_re.match(_to_str(url[:i], errors="replace")): # make sure "iri" is not actually a port number (in which case # "scheme" is really part of the path) rest = url[i + 1 :] if not rest or any(c not in s("0123456789") for c in rest): # not a port number scheme, url = url[:i].lower(), rest if url[:2] == s("//"): delim = len(url) for c in s("/?#"): wdelim = url.find(c, 2) if wdelim >= 0: delim = min(delim, wdelim) netloc, url = url[2:delim], url[delim:] if (s("[") in netloc and s("]") not in netloc) or ( s("]") in netloc and s("[") not in netloc ): raise ValueError("Invalid IPv6 URL") if allow_fragments and s("#") in url: url, fragment = url.split(s("#"), 1) if s("?") in url: url, query = url.split(s("?"), 1) result_type = URL if is_text_based else BytesURL return result_type(scheme, netloc, url, query, fragment) def _make_fast_url_quote( charset: str = "utf-8", errors: str = "strict", safe: str | bytes = "/:", unsafe: str | bytes = "", ) -> t.Callable[[bytes], str]: """Precompile the translation table for a URL encoding function. Unlike :func:`url_quote`, the generated function only takes the string to quote. :param charset: The charset to encode the result with. :param errors: How to handle encoding errors. :param safe: An optional sequence of safe characters to never encode. :param unsafe: An optional sequence of unsafe characters to always encode. """ if isinstance(safe, str): safe = safe.encode(charset, errors) if isinstance(unsafe, str): unsafe = unsafe.encode(charset, errors) safe = (frozenset(bytearray(safe)) | _always_safe) - frozenset(bytearray(unsafe)) table = [chr(c) if c in safe else f"%{c:02X}" for c in range(256)] def quote(string: bytes) -> str: return "".join([table[c] for c in string]) return quote _fast_url_quote = _make_fast_url_quote() _fast_quote_plus = _make_fast_url_quote(safe=" ", unsafe="+") def _fast_url_quote_plus(string: bytes) -> str: return _fast_quote_plus(string).replace(" ", "+") def url_quote( string: str | bytes, charset: str = "utf-8", errors: str = "strict", safe: str | bytes = "/:", unsafe: str | bytes = "", ) -> str: """URL encode a single string with a given encoding. :param s: the string to quote. :param charset: the charset to be used. :param safe: an optional sequence of safe characters. :param unsafe: an optional sequence of unsafe characters. .. deprecated:: 2.3 Will be removed in Werkzeug 3.0. Use ``urllib.parse.quote`` instead. .. versionadded:: 0.9.2 The `unsafe` parameter was added. """ warnings.warn( "'werkzeug.urls.url_quote' is deprecated and will be removed in Werkzeug 3.0." " Use 'urllib.parse.quote' instead.", DeprecationWarning, stacklevel=2, ) if not isinstance(string, (str, bytes, bytearray)): string = str(string) if isinstance(string, str): string = string.encode(charset, errors) if isinstance(safe, str): safe = safe.encode(charset, errors) if isinstance(unsafe, str): unsafe = unsafe.encode(charset, errors) safe = (frozenset(bytearray(safe)) | _always_safe) - frozenset(bytearray(unsafe)) rv = bytearray() for char in bytearray(string): if char in safe: rv.append(char) else: rv.extend(_bytetohex[char]) return bytes(rv).decode(charset) def url_quote_plus( string: str, charset: str = "utf-8", errors: str = "strict", safe: str = "" ) -> str: """URL encode a single string with the given encoding and convert whitespace to "+". :param s: The string to quote. :param charset: The charset to be used. :param safe: An optional sequence of safe characters. .. deprecated:: 2.3 Will be removed in Werkzeug 3.0. Use ``urllib.parse.quote_plus`` instead. """ warnings.warn( "'werkzeug.urls.url_quote_plus' is deprecated and will be removed in Werkzeug" " 2.4. Use 'urllib.parse.quote_plus' instead.", DeprecationWarning, stacklevel=2, ) return url_quote(string, charset, errors, safe + " ", "+").replace(" ", "+") def url_unparse(components: tuple[str, str, str, str, str]) -> str: """The reverse operation to :meth:`url_parse`. This accepts arbitrary as well as :class:`URL` tuples and returns a URL as a string. :param components: the parsed URL as tuple which should be converted into a URL string. .. deprecated:: 2.3 Will be removed in Werkzeug 3.0. Use ``urllib.parse.urlunsplit`` instead. """ warnings.warn( "'werkzeug.urls.url_unparse' is deprecated and will be removed in Werkzeug 3.0." " Use 'urllib.parse.urlunsplit' instead.", DeprecationWarning, stacklevel=2, ) _check_str_tuple(components) scheme, netloc, path, query, fragment = components s = _make_encode_wrapper(scheme) url = s("") # We generally treat file:///x and file:/x the same which is also # what browsers seem to do. This also allows us to ignore a schema # register for netloc utilization or having to differentiate between # empty and missing netloc. if netloc or (scheme and path.startswith(s("/"))): if path and path[:1] != s("/"): path = s("/") + path url = s("//") + (netloc or s("")) + path elif path: url += path if scheme: url = scheme + s(":") + url if query: url = url + s("?") + query if fragment: url = url + s("#") + fragment return url def url_unquote( s: str | bytes, charset: str = "utf-8", errors: str = "replace", unsafe: str = "", ) -> str: """URL decode a single string with a given encoding. If the charset is set to `None` no decoding is performed and raw bytes are returned. :param s: the string to unquote. :param charset: the charset of the query string. If set to `None` no decoding will take place. :param errors: the error handling for the charset decoding. .. deprecated:: 2.3 Will be removed in Werkzeug 3.0. Use ``urllib.parse.unquote`` instead. """ warnings.warn( "'werkzeug.urls.url_unquote' is deprecated and will be removed in Werkzeug 3.0." " Use 'urllib.parse.unquote' instead.", DeprecationWarning, stacklevel=2, ) rv = _unquote_to_bytes(s, unsafe) if charset is None: return rv return rv.decode(charset, errors) def url_unquote_plus( s: str | bytes, charset: str = "utf-8", errors: str = "replace" ) -> str: """URL decode a single string with the given `charset` and decode "+" to whitespace. Per default encoding errors are ignored. If you want a different behavior you can set `errors` to ``'replace'`` or ``'strict'``. :param s: The string to unquote. :param charset: the charset of the query string. If set to `None` no decoding will take place. :param errors: The error handling for the `charset` decoding. .. deprecated:: 2.3 Will be removed in Werkzeug 3.0. Use ``urllib.parse.unquote_plus`` instead. """ warnings.warn( "'werkzeug.urls.url_unquote_plus' is deprecated and will be removed in Werkzeug" " 2.4. Use 'urllib.parse.unquote_plus' instead.", DeprecationWarning, stacklevel=2, ) if isinstance(s, str): s = s.replace("+", " ") else: s = s.replace(b"+", b" ") return url_unquote(s, charset, errors) def url_fix(s: str, charset: str = "utf-8") -> str: r"""Sometimes you get an URL by a user that just isn't a real URL because it contains unsafe characters like ' ' and so on. This function can fix some of the problems in a similar way browsers handle data entered by the user: >>> url_fix('http://de.wikipedia.org/wiki/Elf (Begriffskl\xe4rung)') 'http://de.wikipedia.org/wiki/Elf%20(Begriffskl%C3%A4rung)' :param s: the string with the URL to fix. :param charset: The target charset for the URL if the url was given as a string. .. deprecated:: 2.3 Will be removed in Werkzeug 3.0. """ warnings.warn( "'werkzeug.urls.url_fix' is deprecated and will be removed in Werkzeug 3.0.", DeprecationWarning, stacklevel=2, ) # First step is to switch to text processing and to convert # backslashes (which are invalid in URLs anyways) to slashes. This is # consistent with what Chrome does. s = _to_str(s, charset, "replace").replace("\\", "/") # For the specific case that we look like a malformed windows URL # we want to fix this up manually: if s.startswith("file://") and s[7:8].isalpha() and s[8:10] in (":/", "|/"): s = f"file:///{s[7:]}" url = url_parse(s) path = url_quote(url.path, charset, safe="/%+$!*'(),") qs = url_quote_plus(url.query, charset, safe=":&%=+$!*'(),") anchor = url_quote_plus(url.fragment, charset, safe=":&%=+$!*'(),") return url_unparse((url.scheme, url.encode_netloc(), path, qs, anchor)) def _codec_error_url_quote(e: UnicodeError) -> tuple[str, int]: """Used in :func:`uri_to_iri` after unquoting to re-quote any invalid bytes. """ # the docs state that UnicodeError does have these attributes, # but mypy isn't picking them up out = quote(e.object[e.start : e.end], safe="") # type: ignore return out, e.end # type: ignore codecs.register_error("werkzeug.url_quote", _codec_error_url_quote) def _make_unquote_part(name: str, chars: str) -> t.Callable[[str, str, str], str]: """Create a function that unquotes all percent encoded characters except those given. This allows working with unquoted characters if possible while not changing the meaning of a given part of a URL. """ choices = "|".join(f"{ord(c):02X}" for c in sorted(chars)) pattern = re.compile(f"((?:%(?:{choices}))+)", re.I) def _unquote_partial(value: str, encoding: str, errors: str) -> str: parts = iter(pattern.split(value)) out = [] for part in parts: out.append(unquote(part, encoding, errors)) out.append(next(parts, "")) return "".join(out) _unquote_partial.__name__ = f"_unquote_{name}" return _unquote_partial # characters that should remain quoted in URL parts # based on https://url.spec.whatwg.org/#percent-encoded-bytes # always keep all controls, space, and % quoted _always_unsafe = bytes((*range(0x21), 0x25, 0x7F)).decode() _unquote_fragment = _make_unquote_part("fragment", _always_unsafe) _unquote_query = _make_unquote_part("query", _always_unsafe + "&=+#") _unquote_path = _make_unquote_part("path", _always_unsafe + "/?#") _unquote_user = _make_unquote_part("user", _always_unsafe + ":@/?#") def uri_to_iri( uri: str | tuple[str, str, str, str, str], charset: str | None = None, errors: str | None = None, ) -> str: """Convert a URI to an IRI. All valid UTF-8 characters are unquoted, leaving all reserved and invalid characters quoted. If the URL has a domain, it is decoded from Punycode. >>> uri_to_iri("http://xn--n3h.net/p%C3%A5th?q=%C3%A8ry%DF") 'http://\\u2603.net/p\\xe5th?q=\\xe8ry%DF' :param uri: The URI to convert. :param charset: The encoding to encode unquoted bytes with. :param errors: Error handler to use during ``bytes.encode``. By default, invalid bytes are left quoted. .. versionchanged:: 2.3 Passing a tuple or bytes, and the ``charset`` and ``errors`` parameters, are deprecated and will be removed in Werkzeug 3.0. .. versionchanged:: 2.3 Which characters remain quoted is specific to each part of the URL. .. versionchanged:: 0.15 All reserved and invalid characters remain quoted. Previously, only some reserved characters were preserved, and invalid bytes were replaced instead of left quoted. .. versionadded:: 0.6 """ if isinstance(uri, tuple): warnings.warn( "Passing a tuple is deprecated and will not be supported in Werkzeug 3.0.", DeprecationWarning, stacklevel=2, ) uri = urlunsplit(uri) if isinstance(uri, bytes): warnings.warn( "Passing bytes is deprecated and will not be supported in Werkzeug 3.0.", DeprecationWarning, stacklevel=2, ) uri = uri.decode() if charset is not None: warnings.warn( "The 'charset' parameter is deprecated and will be removed" " in Werkzeug 3.0.", DeprecationWarning, stacklevel=2, ) else: charset = "utf-8" if errors is not None: warnings.warn( "The 'errors' parameter is deprecated and will be removed in Werkzeug 3.0.", DeprecationWarning, stacklevel=2, ) else: errors = "werkzeug.url_quote" parts = urlsplit(uri) path = _unquote_path(parts.path, charset, errors) query = _unquote_query(parts.query, charset, errors) fragment = _unquote_fragment(parts.fragment, charset, errors) if parts.hostname: netloc = _decode_idna(parts.hostname) else: netloc = "" if ":" in netloc: netloc = f"[{netloc}]" if parts.port: netloc = f"{netloc}:{parts.port}" if parts.username: auth = _unquote_user(parts.username, charset, errors) if parts.password: auth = f"{auth}:{_unquote_user(parts.password, charset, errors)}" netloc = f"{auth}@{netloc}" return urlunsplit((parts.scheme, netloc, path, query, fragment)) def iri_to_uri( iri: str | tuple[str, str, str, str, str], charset: str | None = None, errors: str | None = None, safe_conversion: bool | None = None, ) -> str: """Convert an IRI to a URI. All non-ASCII and unsafe characters are quoted. If the URL has a domain, it is encoded to Punycode. >>> iri_to_uri('http://\\u2603.net/p\\xe5th?q=\\xe8ry%DF') 'http://xn--n3h.net/p%C3%A5th?q=%C3%A8ry%DF' :param iri: The IRI to convert. :param charset: The encoding of the IRI. :param errors: Error handler to use during ``bytes.encode``. .. versionchanged:: 2.3 Passing a tuple or bytes, and the ``charset`` and ``errors`` parameters, are deprecated and will be removed in Werkzeug 3.0. .. versionchanged:: 2.3 Which characters remain unquoted is specific to each part of the URL. .. versionchanged:: 2.3 The ``safe_conversion`` parameter is deprecated and will be removed in Werkzeug 2.4. .. versionchanged:: 0.15 All reserved characters remain unquoted. Previously, only some reserved characters were left unquoted. .. versionchanged:: 0.9.6 The ``safe_conversion`` parameter was added. .. versionadded:: 0.6 """ if charset is not None: warnings.warn( "The 'charset' parameter is deprecated and will be removed" " in Werkzeug 3.0.", DeprecationWarning, stacklevel=2, ) else: charset = "utf-8" if isinstance(iri, tuple): warnings.warn( "Passing a tuple is deprecated and will not be supported in Werkzeug 3.0.", DeprecationWarning, stacklevel=2, ) iri = urlunsplit(iri) if isinstance(iri, bytes): warnings.warn( "Passing bytes is deprecated and will not be supported in Werkzeug 3.0.", DeprecationWarning, stacklevel=2, ) iri = iri.decode(charset) if errors is not None: warnings.warn( "The 'errors' parameter is deprecated and will be removed in Werkzeug 3.0.", DeprecationWarning, stacklevel=2, ) else: errors = "strict" if safe_conversion is not None: warnings.warn( "The 'safe_conversion' parameter is deprecated and will be removed in" " Werkzeug 3.0.", DeprecationWarning, stacklevel=2, ) if safe_conversion: # If we're not sure if it's safe to normalize the URL, and it only contains # ASCII characters, return it as-is. try: ascii_iri = iri.encode("ascii") # Only return if it doesn't have whitespace. (Why?) if len(ascii_iri.split()) == 1: return iri except UnicodeError: pass parts = urlsplit(iri) # safe = https://url.spec.whatwg.org/#url-path-segment-string # as well as percent for things that are already quoted path = quote(parts.path, safe="%!$&'()*+,/:;=@", encoding=charset, errors=errors) query = quote(parts.query, safe="%!$&'()*+,/:;=?@", encoding=charset, errors=errors) fragment = quote( parts.fragment, safe="%!#$&'()*+,/:;=?@", encoding=charset, errors=errors ) if parts.hostname: netloc = parts.hostname.encode("idna").decode("ascii") else: netloc = "" if ":" in netloc: netloc = f"[{netloc}]" if parts.port: netloc = f"{netloc}:{parts.port}" if parts.username: auth = quote(parts.username, safe="%!$&'()*+,;=") if parts.password: pass_quoted = quote(parts.password, safe="%!$&'()*+,;=") auth = f"{auth}:{pass_quoted}" netloc = f"{auth}@{netloc}" return urlunsplit((parts.scheme, netloc, path, query, fragment)) def _invalid_iri_to_uri(iri: str) -> str: """The URL scheme ``itms-services://`` must contain the ``//`` even though it does not have a host component. There may be other invalid schemes as well. Currently, responses will always call ``iri_to_uri`` on the redirect ``Location`` header, which removes the ``//``. For now, if the IRI only contains ASCII and does not contain spaces, pass it on as-is. In Werkzeug 3.0, this should become a ``response.process_location`` flag. :meta private: """ try: iri.encode("ascii") except UnicodeError: pass else: if len(iri.split(None, 1)) == 1: return iri return iri_to_uri(iri) def url_decode( s: t.AnyStr, charset: str = "utf-8", include_empty: bool = True, errors: str = "replace", separator: str = "&", cls: type[ds.MultiDict] | None = None, ) -> ds.MultiDict[str, str]: """Parse a query string and return it as a :class:`MultiDict`. :param s: The query string to parse. :param charset: Decode bytes to string with this charset. If not given, bytes are returned as-is. :param include_empty: Include keys with empty values in the dict. :param errors: Error handling behavior when decoding bytes. :param separator: Separator character between pairs. :param cls: Container to hold result instead of :class:`MultiDict`. .. deprecated:: 2.3 Will be removed in Werkzeug 3.0. Use ``urllib.parse.parse_qs`` instead. .. versionchanged:: 2.1 The ``decode_keys`` parameter was removed. .. versionchanged:: 0.5 In previous versions ";" and "&" could be used for url decoding. Now only "&" is supported. If you want to use ";", a different ``separator`` can be provided. .. versionchanged:: 0.5 The ``cls`` parameter was added. """ warnings.warn( "'werkzeug.urls.url_decode' is deprecated and will be removed in Werkzeug 2.4." " Use 'urllib.parse.parse_qs' instead.", DeprecationWarning, stacklevel=2, ) if cls is None: from .datastructures import MultiDict # noqa: F811 cls = MultiDict if isinstance(s, str) and not isinstance(separator, str): separator = separator.decode(charset or "ascii") elif isinstance(s, bytes) and not isinstance(separator, bytes): separator = separator.encode(charset or "ascii") # type: ignore return cls( _url_decode_impl( s.split(separator), charset, include_empty, errors # type: ignore ) ) def url_decode_stream( stream: t.IO[bytes], charset: str = "utf-8", include_empty: bool = True, errors: str = "replace", separator: bytes = b"&", cls: type[ds.MultiDict] | None = None, limit: int | None = None, ) -> ds.MultiDict[str, str]: """Works like :func:`url_decode` but decodes a stream. The behavior of stream and limit follows functions like :func:`~werkzeug.wsgi.make_line_iter`. The generator of pairs is directly fed to the `cls` so you can consume the data while it's parsed. :param stream: a stream with the encoded querystring :param charset: the charset of the query string. If set to `None` no decoding will take place. :param include_empty: Set to `False` if you don't want empty values to appear in the dict. :param errors: the decoding error behavior. :param separator: the pair separator to be used, defaults to ``&`` :param cls: an optional dict class to use. If this is not specified or `None` the default :class:`MultiDict` is used. :param limit: the content length of the URL data. Not necessary if a limited stream is provided. .. deprecated:: 2.3 Will be removed in Werkzeug 2.4. Use ``urllib.parse.parse_qs`` instead. .. versionchanged:: 2.1 The ``decode_keys`` and ``return_iterator`` parameters were removed. .. versionadded:: 0.8 """ warnings.warn( "'werkzeug.urls.url_decode_stream' is deprecated and will be removed in" " Werkzeug 2.4. Use 'urllib.parse.parse_qs' instead.", DeprecationWarning, stacklevel=2, ) from .wsgi import make_chunk_iter pair_iter = make_chunk_iter(stream, separator, limit) decoder = _url_decode_impl(pair_iter, charset, include_empty, errors) if cls is None: from .datastructures import MultiDict # noqa: F811 cls = MultiDict return cls(decoder) def _url_decode_impl( pair_iter: t.Iterable[t.AnyStr], charset: str, include_empty: bool, errors: str ) -> t.Iterator[tuple[str, str]]: for pair in pair_iter: if not pair: continue s = _make_encode_wrapper(pair) equal = s("=") if equal in pair: key, value = pair.split(equal, 1) else: if not include_empty: continue key = pair value = s("") yield ( url_unquote_plus(key, charset, errors), url_unquote_plus(value, charset, errors), ) def url_encode( obj: t.Mapping[str, str] | t.Iterable[tuple[str, str]], charset: str = "utf-8", sort: bool = False, key: t.Callable[[tuple[str, str]], t.Any] | None = None, separator: str = "&", ) -> str: """URL encode a dict/`MultiDict`. If a value is `None` it will not appear in the result string. Per default only values are encoded into the target charset strings. :param obj: the object to encode into a query string. :param charset: the charset of the query string. :param sort: set to `True` if you want parameters to be sorted by `key`. :param separator: the separator to be used for the pairs. :param key: an optional function to be used for sorting. For more details check out the :func:`sorted` documentation. .. deprecated:: 2.3 Will be removed in Werkzeug 2.4. Use ``urllib.parse.urlencode`` instead. .. versionchanged:: 2.1 The ``encode_keys`` parameter was removed. .. versionchanged:: 0.5 Added the ``sort``, ``key``, and ``separator`` parameters. """ warnings.warn( "'werkzeug.urls.url_encode' is deprecated and will be removed in Werkzeug 2.4." " Use 'urllib.parse.urlencode' instead.", DeprecationWarning, stacklevel=2, ) separator = _to_str(separator, "ascii") return separator.join(_url_encode_impl(obj, charset, sort, key)) def url_encode_stream( obj: t.Mapping[str, str] | t.Iterable[tuple[str, str]], stream: t.IO[str] | None = None, charset: str = "utf-8", sort: bool = False, key: t.Callable[[tuple[str, str]], t.Any] | None = None, separator: str = "&", ) -> None: """Like :meth:`url_encode` but writes the results to a stream object. If the stream is `None` a generator over all encoded pairs is returned. :param obj: the object to encode into a query string. :param stream: a stream to write the encoded object into or `None` if an iterator over the encoded pairs should be returned. In that case the separator argument is ignored. :param charset: the charset of the query string. :param sort: set to `True` if you want parameters to be sorted by `key`. :param separator: the separator to be used for the pairs. :param key: an optional function to be used for sorting. For more details check out the :func:`sorted` documentation. .. deprecated:: 2.3 Will be removed in Werkzeug 2.4. Use ``urllib.parse.urlencode`` instead. .. versionchanged:: 2.1 The ``encode_keys`` parameter was removed. .. versionadded:: 0.8 """ warnings.warn( "'werkzeug.urls.url_encode_stream' is deprecated and will be removed in" " Werkzeug 2.4. Use 'urllib.parse.urlencode' instead.", DeprecationWarning, stacklevel=2, ) separator = _to_str(separator, "ascii") gen = _url_encode_impl(obj, charset, sort, key) if stream is None: return gen # type: ignore for idx, chunk in enumerate(gen): if idx: stream.write(separator) stream.write(chunk) return None def url_join( base: str | tuple[str, str, str, str, str], url: str | tuple[str, str, str, str, str], allow_fragments: bool = True, ) -> str: """Join a base URL and a possibly relative URL to form an absolute interpretation of the latter. :param base: the base URL for the join operation. :param url: the URL to join. :param allow_fragments: indicates whether fragments should be allowed. .. deprecated:: 2.3 Will be removed in Werkzeug 2.4. Use ``urllib.parse.urljoin`` instead. """ warnings.warn( "'werkzeug.urls.url_join' is deprecated and will be removed in Werkzeug 2.4." " Use 'urllib.parse.urljoin' instead.", DeprecationWarning, stacklevel=2, ) if isinstance(base, tuple): base = url_unparse(base) if isinstance(url, tuple): url = url_unparse(url) _check_str_tuple((base, url)) s = _make_encode_wrapper(base) if not base: return url if not url: return base bscheme, bnetloc, bpath, bquery, bfragment = url_parse( base, allow_fragments=allow_fragments ) scheme, netloc, path, query, fragment = url_parse(url, bscheme, allow_fragments) if scheme != bscheme: return url if netloc: return url_unparse((scheme, netloc, path, query, fragment)) netloc = bnetloc if path[:1] == s("/"): segments = path.split(s("/")) elif not path: segments = bpath.split(s("/")) if not query: query = bquery else: segments = bpath.split(s("/"))[:-1] + path.split(s("/")) # If the rightmost part is "./" we want to keep the slash but # remove the dot. if segments[-1] == s("."): segments[-1] = s("") # Resolve ".." and "." segments = [segment for segment in segments if segment != s(".")] while True: i = 1 n = len(segments) - 1 while i < n: if segments[i] == s("..") and segments[i - 1] not in (s(""), s("..")): del segments[i - 1 : i + 1] break i += 1 else: break # Remove trailing ".." if the URL is absolute unwanted_marker = [s(""), s("..")] while segments[:2] == unwanted_marker: del segments[1] path = s("/").join(segments) return url_unparse((scheme, netloc, path, query, fragment)) def _urlencode( query: t.Mapping[str, str] | t.Iterable[tuple[str, str]], encoding: str = "utf-8" ) -> str: items = [x for x in iter_multi_items(query) if x[1] is not None] # safe = https://url.spec.whatwg.org/#percent-encoded-bytes return urlencode(items, safe="!$'()*,/:;?@", encoding=encoding)