forked from mirrors/nixpkgs
d2b225789c
The update-extensions.py script downloads extension information once for each shell version, although one extension version might be used for multiple shell versions. To reduce the number of downloads, we first select the appropriate extension version per shell version as before, then download the corresponding information once for each selected extension version, and finally map each shell version to the downloaded information of the selected extension version.
302 lines
12 KiB
Python
Executable file
302 lines
12 KiB
Python
Executable file
#!/usr/bin/env nix-shell
|
|
#!nix-shell -I nixpkgs=../../../.. -i python3 -p python3
|
|
|
|
import json
|
|
import urllib.request
|
|
import urllib.error
|
|
from typing import List, Dict, Optional, Any, Tuple
|
|
import logging
|
|
from operator import itemgetter
|
|
import subprocess
|
|
import zipfile
|
|
import io
|
|
import base64
|
|
|
|
# We don't want all those deprecated legacy extensions, so only the GNOME Shell
# releases listed here are considered.
# Extensions are grouped by GNOME "major" version for compatibility reasons:
# each key is the major version, each value is the prefix that the upstream
# shell version strings of that release start with.
supported_versions = {
    "38": "3.38",
    "40": "40",
    "41": "41",
    "42": "42",
}


# Type aliases that keep the compound type annotations in this file readable
PackageName = str
ShellVersion = str
Uuid = str
ExtensionVersion = int


# Registry of all package names used so far, kept in order to detect collisions.
# This works because all extensions are processed deterministically, in
# historical order.
# Duplicates are only tracked within the same Shell version, hence two levels:
#   outer key: shell version
#   inner key: pname, inner value: list of UUIDs that resolved to that pname
package_name_registry: Dict[ShellVersion, Dict[PackageName, List[Uuid]]] = {
    major_version: {} for major_version in supported_versions
}
|
|
|
|
|
|
def fetch_extension_data(uuid: str, version: str) -> Tuple[str, str]:
    """
    Download the extension and hash it. We use `nix-prefetch-url` for this for efficiency reasons.

    Args:
        uuid: UUID of the extension. Any "@" is stripped before it is put into the download URL.
        version: Extension version to download, already formatted as a string.

    Returns:
        A tuple with the hash (Nix-compatible) of the zip file's content and the
        base64-encoded content of its metadata.json.

    Raises:
        urllib.error.URLError: If the zip file cannot be downloaded.
        KeyError: If the downloaded archive does not contain a metadata.json.
        subprocess.CalledProcessError: If `nix-prefetch-url` exits with a non-zero status.
    """

    # The download URLs follow this schema
    url_uuid = uuid.replace("@", "")
    url: str = f"https://extensions.gnome.org/extension-data/{url_uuid}.v{version}.shell-extension.zip"

    # Yes, we download that file three times:

    # The first time is for the maintainer, so they may have a personal backup to fix potential issues
    # subprocess.run(
    #     ["wget", url], capture_output=True, text=True
    # )

    # The second time, we extract the metadata.json because we need it too
    with urllib.request.urlopen(url) as response:
        archive_bytes = response.read()
    # Open the archive in a `with` block so it is closed again once we are done with it
    with zipfile.ZipFile(io.BytesIO(archive_bytes), "r") as archive:
        metadata = base64.b64encode(archive.read("metadata.json")).decode()

    # The third time is to get the file into the store and to get its hash.
    # `check=True` makes a failed prefetch raise instead of silently handing
    # back an empty string as the hash.
    prefetch = subprocess.run(
        ["nix-prefetch-url", "--unpack", url],
        capture_output=True,
        text=True,
        check=True,
    )
    sha256 = prefetch.stdout.strip()

    return sha256, metadata
|
|
|
|
|
|
def generate_extension_versions(
    extension_version_map: Dict[ShellVersion, ExtensionVersion], uuid: str
) -> Dict[ShellVersion, Dict[str, str]]:
    """
    Takes in a mapping from shell versions to extension versions and transforms it the way we need it:
    - Only take one extension version per GNOME Shell major version (as per `supported_versions`)
    - Filter out versions that only support old GNOME versions
    - Download the extension and hash it

    Each selected extension version is downloaded only once, even if several
    shell versions end up using it.

    Args:
        extension_version_map: Maps upstream shell version strings to extension versions.
        uuid: UUID of the extension; used for logging and for the download.

    Returns:
        A dict keyed by the keys of `supported_versions`. Each value has the
        string entries "version", "sha256" and "metadata". Shell versions
        without a compatible extension version are left out.
    """

    # Determine extension version per shell version
    extension_versions: Dict[ShellVersion, ExtensionVersion] = {}
    for shell_version, version_prefix in supported_versions.items():
        # Newest compatible extension version
        extension_version: Optional[int] = max(
            (
                int(ext_ver)
                for shell_ver, ext_ver in extension_version_map.items()
                if shell_ver.startswith(version_prefix)
            ),
            default=None,
        )
        # Extension is not compatible with this GNOME version.
        # Compare against None explicitly: a plain truthiness test would also
        # discard a legitimate extension version 0.
        if extension_version is None:
            continue

        extension_versions[shell_version] = extension_version

    # Download information once for all extension versions chosen above
    extension_info_cache: Dict[ExtensionVersion, Tuple[str, str]] = {}
    for extension_version in sorted(set(extension_versions.values())):
        logging.debug(
            f"[{uuid}] Downloading v{extension_version}"
        )
        extension_info_cache[extension_version] = \
            fetch_extension_data(uuid, str(extension_version))

    # Fill map
    extension_versions_full: Dict[ShellVersion, Dict[str, str]] = {}
    for shell_version, extension_version in extension_versions.items():
        sha256, metadata = extension_info_cache[extension_version]

        extension_versions_full[shell_version] = {
            "version": str(extension_version),
            "sha256": sha256,
            # The downloads are impure, their metadata.json may change at any time.
            # Thus, we back it up / pin it to remain deterministic
            # Upstream issue: https://gitlab.gnome.org/Infrastructure/extensions-web/-/issues/137
            "metadata": metadata,
        }
    return extension_versions_full
|
|
|
|
|
|
def pname_from_url(url: str) -> Tuple[str, str]:
    """
    Parse something like "/extension/1475/battery-time/" and output ("battery-time", "1475")

    Args:
        url: Link of the form "/extension/$number/$string/".

    Returns:
        A tuple of the string segment and the number segment, in that order.

    Raises:
        IndexError: If the link has fewer than four "/"-separated segments.
    """

    # Keep the split result in its own variable instead of rebinding the `str`
    # parameter to a list. The leading "/" yields an empty first segment, so
    # the number is at index 2 and the name at index 3.
    segments = url.split("/")
    return (segments[3], segments[2])
|
|
|
|
|
|
def process_extension(extension: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    """
    Process an extension. It takes in raw scraped data and downloads all the necessary information that buildGnomeExtension.nix requires

    Input: a json object of one extension queried from the site. It has the following schema (only important key listed):
    {
        "uuid": str,
        "name": str,
        "description": str,
        "link": str,
        "shell_version_map": {
            str: { "version": int, … },
            …
        },
        …
    }

    "uuid" is an extension UUID that looks like this (most of the time): "extension-name@username.domain.tld".
    Don't make any assumptions on it, and treat it like an opaque string!
    "link" follows the following schema: "/extension/$number/$string/"
    The number is monotonically increasing and unique to every extension.
    The string is usually derived from the extension's name (but shortened, kebab-cased and URL friendly).
    It may diverge from the actual name.
    The keys of "shell_version_map" are GNOME Shell version numbers.

    Output: a json object to be stored, or None if the extension should be skipped. Schema:
    {
        "uuid": str,
        "name": str,
        "pname": str,
        "description": str,
        "link": str,
        "shell_version_map": {
            str: { "version": str, "sha256": str, "metadata": <base64> },
            …
        }
    }

    "uuid", "name" and "description" get passed along unmodified. "link" is turned into an absolute URL
    by prefixing it with "https://extensions.gnome.org".
    "pname" gets generated from "link" and "shell_version_map" has a completely different structure than the input
    field with the same name.

    Side effect: the extension's pname is recorded in `package_name_registry` for every shell version
    it supports, and a warning is logged if that pname is already taken for that shell version.
    """
    uuid = extension["uuid"]

    # Yeah, there are some extensions without any releases
    if not extension["shell_version_map"]:
        return None
    logging.info(f"Processing '{uuid}'")

    # Input is a mapping str -> { version: int, … }
    # We want to map shell versions to extension versions
    raw_version_map: Dict[ShellVersion, ExtensionVersion] = {
        k: v["version"] for k, v in extension["shell_version_map"].items()
    }
    # Transform the map to be more useful for us. Also throw away unwanted versions.
    # A separate name is used because the result has a different type than the raw map.
    shell_version_map: Dict[ShellVersion, Dict[str, str]] = generate_extension_versions(
        raw_version_map, uuid
    )

    # No compatible versions found
    if not shell_version_map:
        return None

    # Fetch a human-readable name for the package.
    (pname, _pname_id) = pname_from_url(extension["link"])

    for shell_version in shell_version_map:
        if pname in package_name_registry[shell_version]:
            logging.warning(f"Package name '{pname}' is colliding.")
            package_name_registry[shell_version][pname].append(uuid)
        else:
            package_name_registry[shell_version][pname] = [uuid]

    return {
        "uuid": uuid,
        "name": extension["name"],
        "pname": pname,
        "description": extension["description"],
        "link": "https://extensions.gnome.org" + extension["link"],
        "shell_version_map": shell_version_map,
    }
|
|
|
|
|
|
def scrape_extensions_index() -> List[Dict[str, Any]]:
    """
    Collect the list of all extensions by querying the search API page by page.

    Paging stops at the first page that is not full, or when the server
    answers with a 404 error. Any other HTTP error is re-raised.

    The returned list is sorted by the age of the extension, in order to be deterministic.
    """
    collected = []
    page = 1
    while True:
        logging.info("Scraping page " + str(page))
        query_url = f"https://extensions.gnome.org/extension-query/?n_per_page=25&page={page}"
        try:
            with urllib.request.urlopen(query_url) as response:
                page_entries = json.loads(response.read().decode())["extensions"]
                entry_count = len(page_entries)

                collected.extend(page_entries)

                # If our page isn't "full", it must have been the last one
                if entry_count < 25:
                    logging.debug(
                        f"\tThis page only has {entry_count} entries, so it must be the last one."
                    )
                    break
        except urllib.error.HTTPError as e:
            if e.code != 404:
                raise
            # We reached past the last page and are done now
            break
        page += 1

    # `pk` is the primary key in the extensions.gnome.org database. Sorting on it will give us a stable,
    # deterministic ordering.
    collected.sort(key=itemgetter("pk"))
    return collected
|
|
|
|
|
|
# Script entry point: scrape the extension index, process every extension,
# then write the results to extensions.json and the pname collisions to
# collisions.json (both in the current working directory).
if __name__ == "__main__":
    logging.basicConfig(level=logging.DEBUG)

    raw_extensions = scrape_extensions_index()

    logging.info(f"Downloaded {len(raw_extensions)} extensions. Processing …")
    processed_extensions: List[Dict[str, Any]] = []
    for num, raw_extension in enumerate(raw_extensions):
        processed_extension = process_extension(raw_extension)
        if processed_extension:
            processed_extensions.append(processed_extension)
        logging.debug(f"Processed {num + 1} / {len(raw_extensions)}")

    logging.info(
        f"Done. Writing results to extensions.json ({len(processed_extensions)} extensions in total)"
    )

    # The JSON is written with ensure_ascii=False, so pin the encoding instead
    # of depending on the locale's default.
    with open("extensions.json", "w", encoding="utf-8") as out:
        # Manually pretty-print the outer level, but then do one compact line per extension
        # This allows for the diffs to be manageable (one line of change per extension) despite their quantity
        # The opening bracket is written unconditionally so that an empty result is still valid JSON.
        out.write("[ ")
        for index, extension in enumerate(processed_extensions):
            if index > 0:
                out.write(", ")
            json.dump(extension, out, ensure_ascii=False)
            out.write("\n")
        out.write("]\n")

    with open("extensions.json", "r", encoding="utf-8") as written:
        # Check that the generated file actually is valid JSON, just to be sure
        json.load(written)

    logging.info(
        "Done. Writing name collisions to collisions.json (please check manually)"
    )
    with open("collisions.json", "w", encoding="utf-8") as out:
        # Filter out those that are not duplicates
        package_name_registry_filtered: Dict[ShellVersion, Dict[PackageName, List[Uuid]]] = {
            # The outer level keys are shell versions
            shell_version: {
                # The inner keys are extension names, with a list of all extensions with that name as value.
                pname: uuids for pname, uuids in collisions.items() if len(uuids) > 1
            } for shell_version, collisions in package_name_registry.items()
        }
        json.dump(package_name_registry_filtered, out, indent=2, ensure_ascii=False)
        out.write("\n")
|