Skip to content

Commit

Permalink
Update to allow for platform-dependent libs in CGO (#18)
Browse files Browse the repository at this point in the history
* Update to allow for platform-dependent libs in CGO

* update make test
  • Loading branch information
jmoney authored Jun 12, 2024
1 parent 3a615e6 commit d503b5b
Show file tree
Hide file tree
Showing 5 changed files with 20 additions and 15 deletions.
11 changes: 6 additions & 5 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ build:
build-example:
@docker build -f ./example/Dockerfile . -t tokenizers-example

release-darwin-%:
release-darwin-%: test
cargo build --release --target $*-apple-darwin
mkdir -p artifacts/darwin-$*
cp target/$*-apple-darwin/release/libtokenizers.a artifacts/darwin-$*/libtokenizers.a
Expand All @@ -15,10 +15,11 @@ release-darwin-%:
mkdir -p artifacts/all
cp artifacts/darwin-$*/libtokenizers.darwin-$*.tar.gz artifacts/all/libtokenizers.darwin-$*.tar.gz

release-linux-%:
docker buildx build --platform linux/$* -f release/Dockerfile . -t tokenizers.linux-$*
release-linux-%: test
docker buildx build --platform linux/$* --build-arg="DOCKER_TARGETPLATFORM=linux/$*" -f release/Dockerfile . -t tokenizers.linux-$*
mkdir -p artifacts/linux-$*
docker run -v $(PWD)/artifacts/linux-$*:/mnt --entrypoint cp tokenizers.linux-$* /workspace/tokenizers/libtokenizers.a /mnt/libtokenizers.a
docker run -v $(PWD)/artifacts/linux-$*:/mnt --entrypoint ls tokenizers.linux-$* /workspace/tokenizers/lib/linux
docker run -v $(PWD)/artifacts/linux-$*:/mnt --entrypoint cp tokenizers.linux-$* /workspace/tokenizers/lib/linux/$*/libtokenizers.a /mnt/libtokenizers.a
cd artifacts/linux-$* && \
tar -czf libtokenizers.linux-$*.tar.gz libtokenizers.a
mkdir -p artifacts/all
Expand All @@ -30,7 +31,7 @@ release: release-darwin-aarch64 release-darwin-x86_64 release-linux-arm64 releas
cp artifacts/all/libtokenizers.linux-x86_64.tar.gz artifacts/all/libtokenizers.linux-amd64.tar.gz

test: build
@go test -v ./... -count=1
@go test -ldflags="-extldflags '-L./'" -v ./... -count=1

clean:
rm -rf libtokenizers.a target
Expand Down
14 changes: 9 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ Go bindings for the [HuggingFace Tokenizers](https://github.com/huggingface/toke
Build your Go application using pre-built native binaries: `docker build --platform=linux/amd64 -f example/Dockerfile .`

Available binaries:

* [darwin-arm64](https://github.com/daulet/tokenizers/releases/latest/download/libtokenizers.darwin-arm64.tar.gz)
* [linux-arm64](https://github.com/daulet/tokenizers/releases/latest/download/libtokenizers.linux-arm64.tar.gz)
* [linux-amd64](https://github.com/daulet/tokenizers/releases/latest/download/libtokenizers.linux-amd64.tar.gz)
Expand All @@ -20,6 +21,7 @@ Available binaries:
TL;DR: [working example](example/main.go).

Load a tokenizer from a JSON config:

```go
import "github.com/daulet/tokenizers"

Expand All @@ -32,6 +34,7 @@ defer tk.Close()
```

Encode text and decode tokens:

```go
fmt.Println("Vocab size:", tk.VocabSize())
// Vocab size: 30522
Expand All @@ -44,18 +47,19 @@ fmt.Println(tk.Decode([]uint32{2829, 4419, 14523, 2058, 1996, 13971, 3899}, true
```

## Benchmarks

```bash
go test . -bench=. -benchmem -benchtime=10s

goos: darwin
goarch: arm64
pkg: github.com/daulet/tokenizers
BenchmarkEncodeNTimes-10 996556 11851 ns/op 116 B/op 6 allocs/op
BenchmarkEncodeNChars-10 1000000000 2.446 ns/op 0 B/op 0 allocs/op
BenchmarkDecodeNTimes-10 7286056 1657 ns/op 112 B/op 4 allocs/op
BenchmarkDecodeNTokens-10 65191378 211.0 ns/op 7 B/op 0 allocs/op
BenchmarkEncodeNTimes-10 996556 11851 ns/op 116 B/op 6 allocs/op
BenchmarkEncodeNChars-10 1000000000 2.446 ns/op 0 B/op 0 allocs/op
BenchmarkDecodeNTimes-10 7286056 1657 ns/op 112 B/op 4 allocs/op
BenchmarkDecodeNTokens-10 65191378 211.0 ns/op 7 B/op 0 allocs/op
PASS
ok github.com/daulet/tokenizers 126.681s
ok github.com/daulet/tokenizers 126.681s
```

## Contributing
Expand Down
2 changes: 1 addition & 1 deletion example/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,6 @@ RUN curl -fsSL https://github.com/daulet/tokenizers/releases/download/${VERSION}
COPY ./example .
COPY ./test/data ./test/data
RUN go mod download
RUN mv ./libtokenizers.a /go/pkg/mod/github.com/daulet/tokenizers@${VERSION}/libtokenizers.a
RUN mv ./libtokenizers.a /go/pkg/mod/github.com/daulet/tokenizers@${VERSION}/lib/$(echo ${TARGETPLATFORM} | tr / -)/libtokenizers.a
# mounting the Go module cache won't work here, since the mv above mutates it (it places libtokenizers.a inside /go/pkg/mod)
RUN go run main.go
6 changes: 3 additions & 3 deletions release/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ COPY ./Cargo.lock ./Cargo.lock
RUN cargo build --release

FROM golang:1.21 as builder-go
ARG TARGETPLATFORM
ARG DOCKER_TARGETPLATFORM
WORKDIR /workspace
COPY ./release/go.mod .
COPY ./release/main.go .
Expand All @@ -18,6 +18,6 @@ COPY tokenizer.go ./tokenizers/
COPY tokenizers.h ./tokenizers/
COPY --from=builder-rust \
/workspace/target/release/libtokenizers.a \
./tokenizers/
./tokenizers/lib/${DOCKER_TARGETPLATFORM}/
COPY ./test/data ./test/data
RUN go run .
RUN go run -ldflags="-extldflags '-L./tokenizers/lib/${DOCKER_TARGETPLATFORM}'" .
2 changes: 1 addition & 1 deletion tokenizer.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ package tokenizers
// TODO(packaging): how do we build the Rust lib for distribution?

/*
#cgo LDFLAGS: ${SRCDIR}/libtokenizers.a -ldl -lm -lstdc++
#cgo LDFLAGS: -ltokenizers -ldl -lm -lstdc++
#include <stdlib.h>
#include "tokenizers.h"
*/
Expand Down

0 comments on commit d503b5b

Please sign in to comment.