mirror of
https://github.com/NixOS/nixpkgs.git
synced 2024-12-25 03:17:13 +00:00
122 lines
3.5 KiB
Nix
122 lines
3.5 KiB
Nix
# Python bindings for Hugging Face `tokenizers`.
#
# The Rust crate under bindings/python is built into a wheel with maturin,
# and the wheel is then installed through the pip install hook. The test
# suite is run with pytest against the installed package.
{ stdenv
, rustPlatform
, fetchFromGitHub
, fetchurl
, maturin
, pipInstallHook
, pytest
, python
, requests
}:

let
  # Vocabulary/merges files used by the tests. They are fetched up front and
  # symlinked into tests/data in postUnpack, since the tests would otherwise
  # attempt network access.
  robertaVocab = fetchurl {
    url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json";
    sha256 = "0m86wpkfb2gdh9x9i9ng2fvwk1rva4p0s98xw996nrjxs7166zwy";
  };
  robertaMerges = fetchurl {
    url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt";
    sha256 = "1idd4rvkpqqbks51i2vjbd928inw7slij9l4r063w3y5fd3ndq8w";
  };
  bertVocab = fetchurl {
    url = "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt";
    sha256 = "18rq42cmqa8zanydsbzrb34xwy4l6cz1y900r4kls57cbhvyvv07";
  };
  openaiVocab = fetchurl {
    url = "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-vocab.json";
    sha256 = "0y40gc9bixj5rxv674br1rxmxkd3ly29p80x1596h8yywwcrpx7x";
  };
  openaiMerges = fetchurl {
    url = "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-merges.txt";
    sha256 = "09a754pm4djjglv3x5pkgwd6f79i2rq8ydg0f7c3q1wmwqdbba8f";
  };
in rustPlatform.buildRustPackage rec {
  pname = "tokenizers";
  version = "0.8.0";

  # Releases of the Python bindings are tagged as python-v<version>.
  src = fetchFromGitHub {
    owner = "huggingface";
    repo = pname;
    rev = "python-v${version}";
    sha256 = "0f5r1wm5ybyk3jvihj1g98y7ihq0iklg0pwkaa11pk1gv0k869w3";
  };

  cargoSha256 = "131bvf35q5n65mq6zws1rp5fn2qkfwfg9sbxi5y6if24n8fpdz4m";

  # Build from the Python bindings subdirectory of the unpacked source.
  sourceRoot = "source/bindings/python";

  nativeBuildInputs = [
    maturin
    pipInstallHook
  ];

  propagatedBuildInputs = [
    python
  ];

  # tokenizers uses pyo3, which requires Rust nightly.
  RUSTC_BOOTSTRAP = 1;

  # The check phase is disabled; the tests run in installCheckPhase (pytest)
  # instead.
  doCheck = false;
  doInstallCheck = true;

  postUnpack = ''
    # Add data files for tests, otherwise tests attempt network access.
    mkdir $sourceRoot/tests/data
    ( cd $sourceRoot/tests/data
      ln -s ${robertaVocab} roberta-base-vocab.json
      ln -s ${robertaMerges} roberta-base-merges.txt
      ln -s ${bertVocab} bert-base-uncased-vocab.txt
      ln -s ${openaiVocab} openai-gpt-vocab.json
      ln -s ${openaiMerges} openai-gpt-merges.txt )
  '';

  postPatch = ''
    # pyo3's build check verifies that Rust is a nightly
    # version. Disable this check.
    substituteInPlace $NIX_BUILD_TOP/$cargoDepsCopy/pyo3/build.rs \
      --replace "check_rustc_version()?;" ""

    # Patching the vendored dependency invalidates the file
    # checksums, so remove them. This should be safe, since
    # this is just a copy of the vendored dependencies and
    # the integrity of the vendored dependencies is validated
    # by cargoSha256.
    sed -r -i 's|"files":\{[^}]+\}|"files":{}|' \
      $NIX_BUILD_TOP/$cargoDepsCopy/pyo3/.cargo-checksum.json

    # Maturin uses the crate name as the wheel name.
    substituteInPlace Cargo.toml \
      --replace "tokenizers-python" "tokenizers"
  '';

  # Build the wheel with maturin rather than the default cargo build.
  buildPhase = ''
    maturin build --release --manylinux off
  '';

  installPhase = ''
    # Put the wheels where the pip install hook can find them.
    install -Dm644 -t dist target/wheels/*.whl
    pipInstallPhase
  '';

  installCheckInputs = [
    pytest
    requests
  ];

  installCheckPhase = ''
    # Append paths, or the binding's tokenizer module will be
    # used, since the test directories have __init__.py
    pytest --import-mode=append
  '';

  meta = with stdenv.lib; {
    homepage = "https://github.com/huggingface/tokenizers";
    description = "Fast State-of-the-Art Tokenizers optimized for Research and Production";
    license = licenses.asl20;
    platforms = platforms.unix;
    maintainers = with maintainers; [ danieldk ];
  };
}