Merge pull request #47411 from graham-at-target/multi-layered-images-crafted
Multi-Layered Docker Images
commit 56b4db9710
@@ -682,6 +682,177 @@ hello latest de2bf4786de6 About a minute ago 25.2MB
 </example>
</section>

<section xml:id="ssec-pkgs-dockerTools-buildLayeredImage">
 <title>buildLayeredImage</title>

 <para>
  Create a Docker image with many of the store paths being on their own layer
  to improve sharing between images.
 </para>

 <variablelist>
  <varlistentry>
   <term>
    <varname>name</varname>
   </term>
   <listitem>
    <para>
     The name of the resulting image.
    </para>
   </listitem>
  </varlistentry>
  <varlistentry>
   <term>
    <varname>tag</varname> <emphasis>optional</emphasis>
   </term>
   <listitem>
    <para>
     Tag of the generated image.
    </para>
    <para>
     <emphasis>Default:</emphasis> the output path's hash
    </para>
   </listitem>
  </varlistentry>
  <varlistentry>
   <term>
    <varname>contents</varname> <emphasis>optional</emphasis>
   </term>
   <listitem>
    <para>
     Top-level paths in the container. Either a single derivation, or a list
     of derivations.
    </para>
    <para>
     <emphasis>Default:</emphasis> <literal>[]</literal>
    </para>
   </listitem>
  </varlistentry>
  <varlistentry>
   <term>
    <varname>config</varname> <emphasis>optional</emphasis>
   </term>
   <listitem>
    <para>
     Run-time configuration of the container. A full list of the options is
     available in the
     <link xlink:href="https://github.com/moby/moby/blob/master/image/spec/v1.2.md#image-json-field-descriptions">
     Docker Image Specification v1.2.0</link>.
    </para>
    <para>
     <emphasis>Default:</emphasis> <literal>{}</literal>
    </para>
   </listitem>
  </varlistentry>
  <varlistentry>
   <term>
    <varname>created</varname> <emphasis>optional</emphasis>
   </term>
   <listitem>
    <para>
     Date and time the layers were created. Follows the same
     <literal>now</literal> exception supported by
     <literal>buildImage</literal>.
    </para>
    <para>
     <emphasis>Default:</emphasis> <literal>1970-01-01T00:00:01Z</literal>
    </para>
   </listitem>
  </varlistentry>
  <varlistentry>
   <term>
    <varname>maxLayers</varname> <emphasis>optional</emphasis>
   </term>
   <listitem>
    <para>
     Maximum number of layers to create.
    </para>
    <para>
     <emphasis>Default:</emphasis> <literal>24</literal>
    </para>
   </listitem>
  </varlistentry>
 </variablelist>
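 <para>
  For instance, all of the options above might be combined as in the
  following sketch; the <literal>redis</literal> package and the chosen
  values are illustrative assumptions, not defaults of this interface:
<programlisting><![CDATA[
pkgs.dockerTools.buildLayeredImage {
  name = "redis";
  tag = "latest";
  contents = [ pkgs.redis ];
  config.Cmd = [ "/bin/redis-server" ];
  created = "now";
  maxLayers = 48;
}
]]></programlisting>
  Note that <literal>created = "now"</literal> trades reproducibility for a
  current timestamp, as with <literal>buildImage</literal>.
 </para>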

 <section xml:id="dockerTools-buildLayeredImage-arg-contents">
  <title>Behavior of <varname>contents</varname> in the final image</title>

  <para>
   Each path directly listed in <varname>contents</varname> will have a
   symlink in the root of the image.
  </para>

  <para>
   For example:
<programlisting><![CDATA[
pkgs.dockerTools.buildLayeredImage {
  name = "hello";
  contents = [ pkgs.hello ];
}
]]></programlisting>
   will create symlinks for all the paths in the <literal>hello</literal>
   package:
<screen><![CDATA[
/bin/hello -> /nix/store/h1zb1padqbbb7jicsvkmrym3r6snphxg-hello-2.10/bin/hello
/share/info/hello.info -> /nix/store/h1zb1padqbbb7jicsvkmrym3r6snphxg-hello-2.10/share/info/hello.info
/share/locale/bg/LC_MESSAGES/hello.mo -> /nix/store/h1zb1padqbbb7jicsvkmrym3r6snphxg-hello-2.10/share/locale/bg/LC_MESSAGES/hello.mo
]]></screen>
  </para>
 </section>

 <section xml:id="dockerTools-buildLayeredImage-arg-config">
  <title>Automatic inclusion of <varname>config</varname> references</title>

  <para>
   The closure of <varname>config</varname> is automatically included in the
   closure of the final image.
  </para>

  <para>
   This allows you to make very simple Docker images with very little code.
   This container will start up and run <command>hello</command>:
<programlisting><![CDATA[
pkgs.dockerTools.buildLayeredImage {
  name = "hello";
  config.Cmd = [ "${pkgs.hello}/bin/hello" ];
}
]]></programlisting>
  </para>
 </section>

 <section xml:id="dockerTools-buildLayeredImage-arg-maxLayers">
  <title>Adjusting <varname>maxLayers</varname></title>

  <para>
   Increasing <varname>maxLayers</varname> increases the number of layers
   which have a chance to be shared between different images.
  </para>

  <para>
   Modern Docker installations support up to 128 layers; however, older
   versions support as few as 42.
  </para>

  <para>
   If the produced image will not be extended by other Docker builds, it is
   safe to set <varname>maxLayers</varname> to <literal>128</literal>;
   however, it will then be impossible to extend the image further.
  </para>

  <para>
   The first <literal>maxLayers - 2</literal> most "popular" paths will have
   their own individual layers, then layer #<literal>maxLayers - 1</literal>
   will contain all the remaining "unpopular" paths, and finally layer
   #<literal>maxLayers</literal> will contain the image configuration.
  </para>

  <para>
   Docker's layers are not inherently ordered; they are content-addressable
   and are not explicitly ordered until they are composed into an image.
  </para>
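  <para>
   As a minimal sketch, an image that no other Docker build will extend
   could raise the limit to the maximum, while a base image should keep it
   low:
<programlisting><![CDATA[
pkgs.dockerTools.buildLayeredImage {
  name = "hello";
  config.Cmd = [ "${pkgs.hello}/bin/hello" ];
  # safe only because nothing will extend this image further
  maxLayers = 128;
}
]]></programlisting>
  </para>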
 </section>
</section>

<section xml:id="ssec-pkgs-dockerTools-fetchFromRegistry">
 <title>pullImage</title>

@@ -58,5 +58,9 @@ import ./make-test.nix ({ pkgs, ... }: {
      # Ensure Docker images can use an unstable date
      $docker->succeed("docker load --input='${pkgs.dockerTools.examples.bash}'");
      $docker->succeed("[ '1970-01-01T00:00:01Z' != \"\$(docker inspect ${pkgs.dockerTools.examples.unstableDate.imageName} | ${pkgs.jq}/bin/jq -r .[].Created)\" ]");

      # Ensure Layered Docker images work
      $docker->succeed("docker load --input='${pkgs.dockerTools.examples.layered-image}'");
      $docker->succeed("docker run --rm ${pkgs.dockerTools.examples.layered-image.imageName}");
    '';
})

@@ -1,4 +1,5 @@
  {
+   symlinkJoin,
    coreutils,
    docker,
    e2fsprogs,

@@ -19,6 +20,7 @@
    utillinux,
    vmTools,
    writeReferencesToFile,
+   referencesByPopularity,
    writeScript,
    writeText,
  }:

@@ -77,7 +79,9 @@ rec {
      ln -sT ${docker.src}/components/engine/pkg/tarsum src/github.com/docker/docker/pkg/tarsum
      go build

-     cp tarsum $out
+     mkdir -p $out/bin
+     cp tarsum $out/bin/
    '';

  # buildEnv creates symlinks to dirs, which is hard to edit inside the overlay VM
@@ -270,6 +274,81 @@ rec {
    perl ${pkgs.pathsFromGraph} closure-* > $out/storePaths
  '';

  # Create $maxLayers worth of Docker Layers, one layer per store path
  # unless there are more paths than $maxLayers. In that case, create
  # $maxLayers-1 layers for the most popular paths, and smush the
  # remaining store paths into one final layer.
  mkManyPureLayers = {
    name,
    # Files to add to the layer.
    closure,
    configJson,
    # Docker has a 42-layer maximum, so we pick 24 to ensure there is
    # plenty of room for extension.
    maxLayers ? 24
  }:
    runCommand "${name}-granular-docker-layers" {
      inherit maxLayers;
      paths = referencesByPopularity closure;
      buildInputs = [ jshon rsync tarsum ];
      enableParallelBuilding = true;
    }
    ''
      # Delete impurities for store path layers, so they don't get
      # shared and taint other projects.
      cat ${configJson} \
        | jshon -d config \
        | jshon -s "1970-01-01T00:00:01Z" -i created > generic.json

      # WARNING!
      # The following code is fiddly w.r.t. ensuring every layer is
      # created, and that no paths are missed. If you change the
      # following head and tail call lines, double-check that your
      # code behaves properly when the number of layers equals:
      # maxLayers-1, maxLayers, and maxLayers+1
      head -n $((maxLayers - 1)) $paths | cat -n | xargs -P$NIX_BUILD_CORES -n2 ${./store-path-to-layer.sh}
      if [ $(cat $paths | wc -l) -ge $maxLayers ]; then
        tail -n+$maxLayers $paths | xargs ${./store-path-to-layer.sh} $maxLayers
      fi

      echo "Finished building layer '$name'"

      mv ./layers $out
    '';

  # Create a "Customisation" layer which adds symlinks at the root of
  # the image to the root paths of the closure. Also add the config
  # data like what command to run and the environment to run it in.
  mkCustomisationLayer = {
    name,
    # Files to add to the layer.
    contents,
    baseJson,
    uid ? 0, gid ? 0,
  }:
    runCommand "${name}-customisation-layer" {
      buildInputs = [ jshon rsync tarsum ];
    }
    ''
      cp -r ${contents}/ ./layer

      # Tar up the layer and throw it into 'layer.tar'.
      echo "Packing layer..."
      mkdir $out
      tar -C layer --sort=name --mtime="@$SOURCE_DATE_EPOCH" --owner=${toString uid} --group=${toString gid} -cf $out/layer.tar .

      # Compute a checksum of the tarball.
      echo "Computing layer checksum..."
      tarhash=$(tarsum < $out/layer.tar)

      # Add a 'checksum' field to the JSON, with the value set to the
      # checksum of the tarball.
      cat ${baseJson} | jshon -s "$tarhash" -i checksum > $out/json

      # Indicate to docker that we're using schema version 1.0.
      echo -n "1.0" > $out/VERSION
    '';

  # Create a "layer" (set of files).
  mkPureLayer = {
    # Name of the layer

@@ -287,7 +366,7 @@ rec {
  }:
    runCommand "docker-layer-${name}" {
      inherit baseJson contents extraCommands;
-     buildInputs = [ jshon rsync ];
+     buildInputs = [ jshon rsync tarsum ];
    }
    ''
      mkdir layer

@@ -314,11 +393,11 @@ rec {

      # Compute a checksum of the tarball.
      echo "Computing layer checksum..."
-     tarsum=$(${tarsum} < $out/layer.tar)
+     tarhash=$(tarsum < $out/layer.tar)

      # Add a 'checksum' field to the JSON, with the value set to the
      # checksum of the tarball.
-     cat ${baseJson} | jshon -s "$tarsum" -i checksum > $out/json
+     cat ${baseJson} | jshon -s "$tarhash" -i checksum > $out/json

      # Indicate to docker that we're using schema version 1.0.
      echo -n "1.0" > $out/VERSION

@@ -402,8 +481,8 @@ rec {

      # Compute the tar checksum and add it to the output json.
      echo "Computing checksum..."
-     ts=$(${tarsum} < $out/layer.tar)
-     cat ${baseJson} | jshon -s "$ts" -i checksum > $out/json
+     tarhash=$(${tarsum}/bin/tarsum < $out/layer.tar)
+     cat ${baseJson} | jshon -s "$tarhash" -i checksum > $out/json
      # Indicate to docker that we're using schema version 1.0.
      echo -n "1.0" > $out/VERSION

@@ -411,6 +490,104 @@ rec {
    '';
  };

  buildLayeredImage = {
    # Image Name
    name,
    # Image tag; the Nix output hash will be used if null
    tag ? null,
    # Files to put on the image (a nix store path or list of paths).
    contents ? [],
    # Docker config; e.g. what command to run on the container.
    config ? {},
    # Time of creation of the image. Passing "now" will make the
    # created date be the time of building.
    created ? "1970-01-01T00:00:01Z",
    # Docker's lowest maximum layer limit is 42 layers for an old
    # version of the AUFS graph driver. We pick 24 to ensure there is
    # plenty of room for extension. I believe the actual maximum is
    # 128.
    maxLayers ? 24
  }:
    let
      uid = 0;
      gid = 0;
      baseName = baseNameOf name;
      contentsEnv = symlinkJoin { name = "bulk-layers"; paths = (if builtins.isList contents then contents else [ contents ]); };

      configJson = let
          pure = writeText "${baseName}-config.json" (builtins.toJSON {
            inherit created config;
            architecture = "amd64";
            os = "linux";
          });
          impure = runCommand "${baseName}-standard-dynamic-date.json"
            { buildInputs = [ jq ]; }
            ''
              jq ".created = \"$(TZ=utc date --iso-8601="seconds")\"" ${pure} > $out
            '';
        in if created == "now" then impure else pure;

      bulkLayers = mkManyPureLayers {
          name = baseName;
          closure = writeText "closure" "${contentsEnv} ${configJson}";
          # One layer will be taken up by the customisationLayer, so
          # take up one less.
          maxLayers = maxLayers - 1;
          inherit configJson;
        };
      customisationLayer = mkCustomisationLayer {
          name = baseName;
          contents = contentsEnv;
          baseJson = configJson;
          inherit uid gid;
        };
      result = runCommand "docker-image-${baseName}.tar.gz" {
        buildInputs = [ jshon pigz coreutils findutils jq ];
        # Image name and tag must be lowercase
        imageName = lib.toLower name;
        imageTag = if tag == null then "" else lib.toLower tag;
        baseJson = configJson;
      } ''
        ${lib.optionalString (tag == null) ''
          outName="$(basename "$out")"
          outHash=$(echo "$outName" | cut -d - -f 1)

          imageTag=$outHash
        ''}

        find ${bulkLayers} -mindepth 1 -maxdepth 1 | sort -t/ -k5 -n > layer-list
        echo ${customisationLayer} >> layer-list

        mkdir image
        imageJson=$(cat ${configJson} | jq ". + {\"rootfs\": {\"diff_ids\": [], \"type\": \"layers\"}}")
        manifestJson=$(jq -n "[{\"RepoTags\":[\"$imageName:$imageTag\"]}]")
        for layer in $(cat layer-list); do
          layerChecksum=$(sha256sum $layer/layer.tar | cut -d ' ' -f1)
          layerID=$(sha256sum "$layer/json" | cut -d ' ' -f 1)
          ln -s "$layer" "./image/$layerID"

          manifestJson=$(echo "$manifestJson" | jq ".[0].Layers |= [\"$layerID/layer.tar\"] + .")
          imageJson=$(echo "$imageJson" | jq ".history |= [{\"created\": \"$(jq -r .created ${configJson})\"}] + .")
          imageJson=$(echo "$imageJson" | jq ".rootfs.diff_ids |= [\"sha256:$layerChecksum\"] + .")
        done
        imageJsonChecksum=$(echo "$imageJson" | sha256sum | cut -d ' ' -f1)
        echo "$imageJson" > "image/$imageJsonChecksum.json"
        manifestJson=$(echo "$manifestJson" | jq ".[0].Config = \"$imageJsonChecksum.json\"")
        echo "$manifestJson" > image/manifest.json

        jshon -n object \
          -n object -s "$layerID" -i "$imageTag" \
          -i "$imageName" > image/repositories

        echo "Cooking the image..."
        tar -C image --dereference --hard-dereference --sort=name --mtime="@$SOURCE_DATE_EPOCH" --owner=0 --group=0 --mode=a-w --xform s:'^./':: -c . | pigz -nT > $out

        echo "Finished."
      '';

    in
    result;

  # 1. extract the base image
  # 2. create the layer
  # 3. add layer deps to the layer itself, diffing with the base image

@@ -150,4 +150,11 @@ rec {
    contents = [ pkgs.coreutils ];
    created = "now";
  };

  # 10. Create a layered image
  layered-image = pkgs.dockerTools.buildLayeredImage {
    name = "layered-image";
    tag = "latest";
    config.Cmd = [ "${pkgs.hello}/bin/hello" ];
  };
}
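A further example exercising contents alongside config could follow the same pattern; a sketch only, with a hypothetical attribute name that is not part of this diff:

    # 11. (hypothetical) layered image with both contents and config
    layered-image-with-contents = pkgs.dockerTools.buildLayeredImage {
      name = "layered-image-with-contents";
      tag = "latest";
      contents = [ pkgs.coreutils ];
      config.Cmd = [ "${pkgs.hello}/bin/hello" ];
    };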
pkgs/build-support/docker/store-path-to-layer.sh (new executable file, 24 lines)

@@ -0,0 +1,24 @@
#!/bin/sh

set -eu

layerNumber=$1
shift

layerPath="./layers/$layerNumber"
echo "Creating layer #$layerNumber for $@"

mkdir -p "$layerPath"
tar -rpf "$layerPath/layer.tar" --hard-dereference --sort=name \
  --mtime="@$SOURCE_DATE_EPOCH" \
  --owner=0 --group=0 "$@"

# Compute a checksum of the tarball.
tarhash=$(tarsum < $layerPath/layer.tar)

# Add a 'checksum' field to the JSON, with the value set to the
# checksum of the tarball.
cat ./generic.json | jshon -s "$tarhash" -i checksum > $layerPath/json

# Indicate to docker that we're using schema version 1.0.
echo -n "1.0" > $layerPath/VERSION
pkgs/build-support/references-by-popularity/closure-graph.py (new file, 520 lines)

@@ -0,0 +1,520 @@
# IMPORTANT: Making changes?
#
# Validate your changes with python3 ./closure-graph.py --test


# Using a simple algorithm, convert the references to a path into a
# sorted list of dependent paths based on how often they're referenced
# and how deep in the tree they live. Equally-"popular" paths are then
# sorted by name.
#
# The existing writeReferencesToFile prints the paths in a simple
# ascii-based sorting of the paths.
#
# Sorting the paths by graph improves the chances that the difference
# between two builds appears near the end of the list, instead of near
# the beginning. This makes a difference for Nix builds which export a
# closure for another program to consume, if that program implements its
# own level of binary diffing.
#
# For example, Docker Images. If each store path is a separate layer
# then Docker Images can be very efficiently transferred between systems,
# and we get very good cache reuse between images built with the same
# version of Nixpkgs. However, since Docker only reliably supports a
# small number of layers (42) it is important to pick the individual
# layers carefully. By storing very popular store paths in the first 40
# layers, we improve the chances that the next Docker image will share
# many of those layers.*
#
# Given the dependency tree:
#
#     A - B - C - D -\
#      \   \   \      \
#       \   \   \      \
#        \   \   \- E ---- F
#         \- G
#
# Nodes which have multiple references are duplicated:
#
#     A - B - C - D - F
#      \   \   \
#       \   \   \- E - F
#        \   \
#         \   \- E - F
#          \
#           \- G
#
# Each leaf node is now replaced by a counter defaulted to 1:
#
#     A - B - C - D - (F:1)
#      \   \   \
#       \   \   \- E - (F:1)
#        \   \
#         \   \- E - (F:1)
#          \
#           \- (G:1)
#
# Then each leaf counter is merged with its parent node, replacing the
# parent node with a counter of 1, and each existing counter being
# incremented by 1. That is to say `- D - (F:1)` becomes `- (D:1, F:2)`:
#
#     A - B - C - (D:1, F:2)
#      \   \   \
#       \   \   \- (E:1, F:2)
#        \   \
#         \   \- (E:1, F:2)
#          \
#           \- (G:1)
#
# Then each leaf counter is merged with its parent node again, merging
# any counters, then incrementing each:
#
#     A - B - (C:1, D:2, E:2, F:5)
#      \   \
#       \   \- (E:1, F:2)
#        \
#         \- (G:1)
#
# And again:
#
#     A - (B:1, C:2, D:3, E:4, F:8)
#      \
#       \- (G:1)
#
# And again:
#
#     (A:1, B:2, C:3, D:4, E:5, F:9, G:2)
#
# and then paths have the following "popularity":
#
#     A  1
#     B  2
#     C  3
#     D  4
#     E  5
#     F  9
#     G  2
#
# and the popularity contest would result in the paths being printed as:
#
#     F
#     E
#     D
#     C
#     B
#     G
#     A
#
# * Note: People who have used a Dockerfile before assume Docker's
# Layers are inherently ordered. However, this is not true -- Docker
# layers are content-addressable and are not explicitly layered until
# they are composed into an Image.

import sys
import json
import unittest

from pprint import pprint
from collections import defaultdict

# Find paths in the original dataset which are never referenced by
# any other paths
def find_roots(closures):
    roots = []

    for closure in closures:
        path = closure['path']
        if not any_refer_to(path, closures):
            roots.append(path)

    return roots

class TestFindRoots(unittest.TestCase):
    def test_find_roots(self):
        self.assertCountEqual(
            find_roots([
                {
                    "path": "/nix/store/foo",
                    "references": [
                        "/nix/store/foo",
                        "/nix/store/bar"
                    ]
                },
                {
                    "path": "/nix/store/bar",
                    "references": [
                        "/nix/store/bar",
                        "/nix/store/tux"
                    ]
                },
                {
                    "path": "/nix/store/hello",
                    "references": [
                    ]
                }
            ]),
            ["/nix/store/foo", "/nix/store/hello"]
        )


def any_refer_to(path, closures):
    for closure in closures:
        if path != closure['path']:
            if path in closure['references']:
                return True
    return False

class TestAnyReferTo(unittest.TestCase):
    def test_has_references(self):
        self.assertTrue(
            any_refer_to(
                "/nix/store/bar",
                [
                    {
                        "path": "/nix/store/foo",
                        "references": [
                            "/nix/store/bar"
                        ]
                    },
                ]
            ),
        )
    def test_no_references(self):
        self.assertFalse(
            any_refer_to(
                "/nix/store/foo",
                [
                    {
                        "path": "/nix/store/foo",
                        "references": [
                            "/nix/store/foo",
                            "/nix/store/bar"
                        ]
                    },
                ]
            ),
        )

def all_paths(closures):
    paths = []
    for closure in closures:
        paths.append(closure['path'])
        paths.extend(closure['references'])
    paths.sort()
    return list(set(paths))


class TestAllPaths(unittest.TestCase):
    def test_returns_all_paths(self):
        self.assertCountEqual(
            all_paths([
                {
                    "path": "/nix/store/foo",
                    "references": [
                        "/nix/store/foo",
                        "/nix/store/bar"
                    ]
                },
                {
                    "path": "/nix/store/bar",
                    "references": [
                        "/nix/store/bar",
                        "/nix/store/tux"
                    ]
                },
                {
                    "path": "/nix/store/hello",
                    "references": [
                    ]
                }
            ]),
            ["/nix/store/foo", "/nix/store/bar", "/nix/store/hello", "/nix/store/tux",]
        )
    def test_no_references(self):
        self.assertFalse(
            any_refer_to(
                "/nix/store/foo",
                [
                    {
                        "path": "/nix/store/foo",
                        "references": [
                            "/nix/store/foo",
                            "/nix/store/bar"
                        ]
                    },
                ]
            ),
        )

# Convert:
#
# [
#    { path: /nix/store/foo, references: [ /nix/store/foo, /nix/store/bar, /nix/store/baz ] },
#    { path: /nix/store/bar, references: [ /nix/store/bar, /nix/store/baz ] },
#    { path: /nix/store/baz, references: [ /nix/store/baz, /nix/store/tux ] },
#    { path: /nix/store/tux, references: [ /nix/store/tux ] }
# ]
#
# To:
# {
#    /nix/store/foo: [ /nix/store/bar, /nix/store/baz ],
#    /nix/store/bar: [ /nix/store/baz ],
#    /nix/store/baz: [ /nix/store/tux ],
#    /nix/store/tux: [ ]
# }
#
# Note that it drops self-references to avoid loops.
def make_lookup(closures):
    lookup = {}

    for closure in closures:
        # paths often self-refer
        nonreferential_paths = [ref for ref in closure['references'] if ref != closure['path']]
        lookup[closure['path']] = nonreferential_paths

    return lookup

class TestMakeLookup(unittest.TestCase):
    def test_returns_lookup(self):
        self.assertDictEqual(
            make_lookup([
                {
                    "path": "/nix/store/foo",
                    "references": [
                        "/nix/store/foo",
                        "/nix/store/bar"
                    ]
                },
                {
                    "path": "/nix/store/bar",
                    "references": [
                        "/nix/store/bar",
                        "/nix/store/tux"
                    ]
                },
                {
                    "path": "/nix/store/hello",
                    "references": [
                    ]
                }
            ]),
            {
                "/nix/store/foo": [ "/nix/store/bar" ],
                "/nix/store/bar": [ "/nix/store/tux" ],
                "/nix/store/hello": [ ],
            }
        )

# Convert:
#
# /nix/store/foo with
# {
#    /nix/store/foo: [ /nix/store/bar, /nix/store/baz ],
#    /nix/store/bar: [ /nix/store/baz ],
#    /nix/store/baz: [ /nix/store/tux ],
#    /nix/store/tux: [ ]
# }
#
# To:
#
# {
#    /nix/store/bar: {
#        /nix/store/baz: {
#            /nix/store/tux: {}
#        }
#    },
#    /nix/store/baz: {
#        /nix/store/tux: {}
#    }
# }
def make_graph_segment_from_root(root, lookup):
    children = {}
    for ref in lookup[root]:
        children[ref] = make_graph_segment_from_root(ref, lookup)
    return children

class TestMakeGraphSegmentFromRoot(unittest.TestCase):
    def test_returns_graph(self):
        self.assertDictEqual(
            make_graph_segment_from_root("/nix/store/foo", {
                "/nix/store/foo": [ "/nix/store/bar" ],
                "/nix/store/bar": [ "/nix/store/tux" ],
                "/nix/store/tux": [ ],
                "/nix/store/hello": [ ],
            }),
            {
                "/nix/store/bar": {
                    "/nix/store/tux": {}
                }
            }
        )
    def test_returns_graph_tiny(self):
        self.assertDictEqual(
            make_graph_segment_from_root("/nix/store/tux", {
                "/nix/store/foo": [ "/nix/store/bar" ],
                "/nix/store/bar": [ "/nix/store/tux" ],
                "/nix/store/tux": [ ],
            }),
            {}
        )

# Convert a graph segment into a popularity-counted dictionary:
#
# From:
# {
#    /nix/store/foo: {
#        /nix/store/bar: {
#            /nix/store/baz: {
#                /nix/store/tux: {}
#            }
#        },
#        /nix/store/baz: {
#            /nix/store/tux: {}
#        }
#    }
# }
#
# to:
# [
#    /nix/store/foo: 1
#    /nix/store/bar: 2
#    /nix/store/baz: 4
#    /nix/store/tux: 6
# ]
def graph_popularity_contest(full_graph):
    popularity = defaultdict(int)
    for path, subgraph in full_graph.items():
        popularity[path] += 1
        subcontest = graph_popularity_contest(subgraph)
        for subpath, subpopularity in subcontest.items():
            popularity[subpath] += subpopularity + 1

    return popularity

class TestGraphPopularityContest(unittest.TestCase):
    def test_counts_popularity(self):
        self.assertDictEqual(
            graph_popularity_contest({
                "/nix/store/foo": {
                    "/nix/store/bar": {
                        "/nix/store/baz": {
                            "/nix/store/tux": {}
                        }
                    },
                    "/nix/store/baz": {
                        "/nix/store/tux": {}
                    }
                }
            }),
            {
                "/nix/store/foo": 1,
                "/nix/store/bar": 2,
                "/nix/store/baz": 4,
                "/nix/store/tux": 6,
            }
        )

# Emit a list of packages by popularity, most first:
#
# From:
# [
#    /nix/store/foo: 1
#    /nix/store/bar: 1
#    /nix/store/baz: 2
#    /nix/store/tux: 2
# ]
#
# To:
# [ /nix/store/baz /nix/store/tux /nix/store/bar /nix/store/foo ]
def order_by_popularity(paths):
    paths_by_popularity = defaultdict(list)
    popularities = []
    for path, popularity in paths.items():
        popularities.append(popularity)
        paths_by_popularity[popularity].append(path)

    popularities = list(set(popularities))
    popularities.sort()

    flat_ordered = []
    for popularity in popularities:
        paths = paths_by_popularity[popularity]
        paths.sort(key=package_name)

        flat_ordered.extend(reversed(paths))
    return list(reversed(flat_ordered))


class TestOrderByPopularity(unittest.TestCase):
    def test_returns_in_order(self):
        self.assertEqual(
            order_by_popularity({
                "/nix/store/foo": 1,
                "/nix/store/bar": 1,
                "/nix/store/baz": 2,
                "/nix/store/tux": 2,
            }),
            [
                "/nix/store/baz",
                "/nix/store/tux",
                "/nix/store/bar",
                "/nix/store/foo"
            ]
        )

def package_name(path):
    parts = path.split('-')
    start = parts.pop(0)
    # don't throw away any data, so the order is always the same,
    # even in cases where only the hash at the start has changed.
    parts.append(start)
    return '-'.join(parts)

def main():
    filename = sys.argv[1]
    key = sys.argv[2]

    with open(filename) as f:
        data = json.load(f)

    # Data comes in as:
    # [
    #    { path: /nix/store/foo, references: [ /nix/store/foo, /nix/store/bar, /nix/store/baz ] },
    #    { path: /nix/store/bar, references: [ /nix/store/bar, /nix/store/baz ] },
    #    { path: /nix/store/baz, references: [ /nix/store/baz, /nix/store/tux ] },
    #    { path: /nix/store/tux, references: [ /nix/store/tux ] }
    # ]
    #
    # and we want to get out a list of paths ordered by how universally
    # important they are, i.e. tux is referenced by every path, transitively,
    # so it should be #1
    #
    # [
    #    /nix/store/tux,
    #    /nix/store/baz,
    #    /nix/store/bar,
    #    /nix/store/foo,
    # ]
    graph = data[key]

    roots = find_roots(graph)
    lookup = make_lookup(graph)

    full_graph = {}
    for root in roots:
        full_graph[root] = make_graph_segment_from_root(root, lookup)

    ordered = order_by_popularity(graph_popularity_contest(full_graph))
    missing = []
    for path in all_paths(graph):
        if path not in ordered:
            missing.append(path)

    ordered.extend(missing)
    print("\n".join(ordered))

if "--test" in sys.argv:
    # Don't pass --test otherwise unittest gets mad
    unittest.main(argv = [f for f in sys.argv if f != "--test" ])
else:
    main()
pkgs/build-support/references-by-popularity/default.nix (new file, 15 lines)

@@ -0,0 +1,15 @@
{ runCommand, python3, coreutils }:

# Write the references of `path' to a file, in order of how "popular" each
# reference is. Nix 2 only.
path: runCommand "closure-paths"
{
  exportReferencesGraph.graph = path;
  __structuredAttrs = true;
  PATH = "${coreutils}/bin:${python3}/bin";
  builder = builtins.toFile "builder"
  ''
    . .attrs.sh
    python3 ${./closure-graph.py} .attrs.json graph > ''${outputs[out]}
  '';
}
""
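As a usage sketch (not part of this diff; Nix 2 only, per the comment above), the helper can be applied to any store path to produce a popularity-ordered closure listing:

    # popularity.nix (hypothetical): builds a text file listing hello's
    # closure, most "popular" (most-referenced) path first.
    with import <nixpkgs> { };
    runCommand "hello-paths-by-popularity" { } ''
      cat ${referencesByPopularity hello} > $out
    ''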
@@ -365,6 +365,8 @@ with pkgs;

   nukeReferences = callPackage ../build-support/nuke-references { };

+  referencesByPopularity = callPackage ../build-support/references-by-popularity { };

   removeReferencesTo = callPackage ../build-support/remove-references-to { };

   vmTools = callPackage ../build-support/vm { };