forked from mirrors/nixpkgs
python3Packages.tokenizers: init at 0.8.0
This commit is contained in:
parent
0091e0358b
commit
66d0b2a0d0
121
pkgs/development/python-modules/tokenizers/default.nix
Normal file
121
pkgs/development/python-modules/tokenizers/default.nix
Normal file
|
@ -0,0 +1,121 @@
|
|||
# Python bindings for the Hugging Face `tokenizers` library.
#
# The package is built with rustPlatform.buildRustPackage from the
# `bindings/python` subdirectory of the upstream repository: maturin
# produces a wheel, which is then installed via the pip install hook.
# The test suite runs in the install check phase against data files
# that are fetched ahead of time.
{ stdenv
, rustPlatform
, fetchFromGitHub
, fetchurl
, maturin
, pipInstallHook
, pytest
, python
, requests
}:

let
  # Fetch a single test data file (vocabulary or merges) by file name.
  # All test data files are served from the same location, so only the
  # file name and the hash differ between them.
  fetchTestData = file: sha256: fetchurl {
    url = "https://s3.amazonaws.com/models.huggingface.co/bert/${file}";
    inherit sha256;
  };

  robertaVocab = fetchTestData "roberta-base-vocab.json"
    "0m86wpkfb2gdh9x9i9ng2fvwk1rva4p0s98xw996nrjxs7166zwy";
  robertaMerges = fetchTestData "roberta-base-merges.txt"
    "1idd4rvkpqqbks51i2vjbd928inw7slij9l4r063w3y5fd3ndq8w";
  bertVocab = fetchTestData "bert-base-uncased-vocab.txt"
    "18rq42cmqa8zanydsbzrb34xwy4l6cz1y900r4kls57cbhvyvv07";
  openaiVocab = fetchTestData "openai-gpt-vocab.json"
    "0y40gc9bixj5rxv674br1rxmxkd3ly29p80x1596h8yywwcrpx7x";
  openaiMerges = fetchTestData "openai-gpt-merges.txt"
    "09a754pm4djjglv3x5pkgwd6f79i2rq8ydg0f7c3q1wmwqdbba8f";
in rustPlatform.buildRustPackage rec {
  pname = "tokenizers";
  version = "0.8.0";

  src = fetchFromGitHub {
    owner = "huggingface";
    repo = pname;
    rev = "python-v${version}";
    sha256 = "0f5r1wm5ybyk3jvihj1g98y7ihq0iklg0pwkaa11pk1gv0k869w3";
  };

  cargoSha256 = "131bvf35q5n65mq6zws1rp5fn2qkfwfg9sbxi5y6if24n8fpdz4m";

  # Build from the Python bindings subdirectory of the unpacked source.
  sourceRoot = "source/bindings/python";

  nativeBuildInputs = [
    maturin
    pipInstallHook
  ];

  propagatedBuildInputs = [
    python
  ];

  # tokenizers uses pyo3, which requires Rust nightly.
  RUSTC_BOOTSTRAP = 1;

  # The regular check phase is disabled; the test suite is run by
  # installCheckPhase below instead.
  doCheck = false;
  doInstallCheck = true;

  postUnpack = ''
    # Add data files for tests, otherwise tests attempt network access.
    # Use -p so that the build does not fail if the directory is
    # already present in the source tree.
    mkdir -p "$sourceRoot/tests/data"
    ( cd "$sourceRoot/tests/data"
      ln -s ${robertaVocab} roberta-base-vocab.json
      ln -s ${robertaMerges} roberta-base-merges.txt
      ln -s ${bertVocab} bert-base-uncased-vocab.txt
      ln -s ${openaiVocab} openai-gpt-vocab.json
      ln -s ${openaiMerges} openai-gpt-merges.txt )
  '';

  postPatch = ''
    # pyo3's build check verifies that Rust is a nightly
    # version. Disable this check.
    substituteInPlace "$NIX_BUILD_TOP/$cargoDepsCopy/pyo3/build.rs" \
      --replace "check_rustc_version()?;" ""

    # Patching the vendored dependency invalidates the file
    # checksums, so remove them. This should be safe, since
    # this is just a copy of the vendored dependencies and
    # the integrity of the vendored dependencies is validated
    # by cargoSha256.
    sed -r -i 's|"files":\{[^}]+\}|"files":{}|' \
      "$NIX_BUILD_TOP/$cargoDepsCopy/pyo3/.cargo-checksum.json"

    # Maturin uses the crate name as the wheel name.
    substituteInPlace Cargo.toml \
      --replace "tokenizers-python" "tokenizers"
  '';

  buildPhase = ''
    maturin build --release --manylinux off
  '';

  installPhase = ''
    # Put the wheels where the pip install hook can find them.
    install -Dm644 -t dist target/wheels/*.whl
    pipInstallPhase
  '';

  installCheckInputs = [
    pytest
    requests
  ];

  installCheckPhase = ''
    # Append paths, or the binding's tokenizer module will be
    # used, since the test directories have __init__.py
    pytest --import-mode=append
  '';

  meta = with stdenv.lib; {
    homepage = "https://github.com/huggingface/tokenizers";
    description = "Fast State-of-the-Art Tokenizers optimized for Research and Production";
    license = licenses.asl20;
    platforms = platforms.unix;
    maintainers = with maintainers; [ danieldk ];
  };
}
|
|
@ -1502,6 +1502,9 @@ in {
|
|||
inherit (pkgs) sentencepiece pkgconfig;
|
||||
};
|
||||
|
||||
# tokenizers is disabled when the interpreter is not Python 3 (isPy3k).
# The callPackage result is passed through toPythonModule — presumably
# because the expression is built with rustPlatform.buildRustPackage
# rather than a Python builder; confirm against toPythonModule's docs.
tokenizers = disabledIf (!isPy3k)
|
||||
(toPythonModule (callPackage ../development/python-modules/tokenizers { }));
|
||||
|
||||
transformers = callPackage ../development/python-modules/transformers { };
|
||||
|
||||
transforms3d = callPackage ../development/python-modules/transforms3d { };
|
||||
|
|
Loading…
Reference in a new issue