Skip to content

Commit

Permalink
=prototype for loading the binary file
Browse files Browse the repository at this point in the history
  • Loading branch information
oxinabox committed Jun 8, 2018
1 parent 83bc5a6 commit 3bd0ab2
Showing 1 changed file with 338 additions and 0 deletions.
338 changes: 338 additions & 0 deletions src/proto.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,338 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"using PretrainedEmbeddings\n",
"\n",
"using DataDeps"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"\"FastText fr CommonCrawl Binary/cc.fr.300.bin\""
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dd_name = language_files(PretrainedEmbeddings.FastText_Bin{:fr}) |> first"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"StatStruct(mode=0o100644, size=7238894263)"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"stat"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#=\n",
"struct entry {\n",
" std::string word;\n",
" int64_t count;\n",
" entry_type type;\n",
" std::vector<int32_t> subwords;\n",
"};\n",
" #="
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"#https://github.com/facebookresearch/fastText/blob/master/src/dictionary.cc"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1-element Array{String,1}:\n",
" \"cc.fr.300.bin\""
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"readdir(datadep\"FastText fr CommonCrawl Binary\")"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Entry"
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"@enum EntryType::Int8 word_type=0 label_type=1\n",
"\n",
"struct Entry\n",
" word::String\n",
" count::Int64\n",
" entry_type:: EntryType\n",
" subwords::Vector{Int32}\n",
"end\n",
"Entry()=Entry(\"\", 0, word_type, Int32[])\n"
]
},
{
"cell_type": "code",
"execution_count": 78,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"magic = read(fh, Int32) = 793712314\n",
"version = read(fh, Int32) = 12\n",
"\n",
"args_dim = read(fh, Int32) = 300\n",
"args_ws = read(fh, Int32) = 5\n",
"args_epoch = read(fh, Int32) = 1\n",
"args_minCount = read(fh, Int32) = 5\n",
"args_neg = read(fh, Int32) = 10\n",
"args_wordNgrams = read(fh, Int32) = 1\n",
"args_loss = read(fh, Int32) = 2\n",
"args_model = read(fh, Int32) = 1\n",
"args_bucket = read(fh, Int32) = 2000000\n",
"args_minn = read(fh, Int32) = 5\n",
"args_maxn = read(fh, Int32) = 5\n",
"args_lrUpdateRate = read(fh, Int32) = 100\n",
"args_t = read(fh, Float64) = 9.999999747378752e-6\n",
"\n",
"size_ = read(fh, Int32) = 2000000\n",
"nwords = read(fh, Int32) = 2000000\n",
"nlabels = read(fh, Int32) = 0\n",
"ntokens = read(fh, Int64) = 68358270953\n",
"pruneidx_size_ = read(fh, Int64) = -1\n",
"\n",
"length(words_) = 2000000\n",
"words_[1] = Entry(\",\", 2854010684, word_type::EntryType = 0, Int32[])\n",
"words_[2] = Entry(\"de\", 2742946523, word_type::EntryType = 0, Int32[])\n",
"words_[3] = Entry(\".\", 1675680641, word_type::EntryType = 0, Int32[])\n",
"words_[end - 1] = Entry(\"Fautereau\", 235, word_type::EntryType = 0, Int32[])\n",
"words_[end] = Entry(\"IdealCoque\", 235, word_type::EntryType = 0, Int32[])\n",
"\n",
"\n",
"quant_input = read(fh, Bool) = false\n",
"m_ = read(fh, Int64) = 4000000\n",
"n_ = read(fh, Int64) = 300\n",
"(typeof(data), size(data)) = (Array{Float32,2}, (4000000, 300))\n",
"quant_output = read(fh, Bool) = false\n",
"m_ = read(fh, Int64) = 2000000\n",
"n_ = read(fh, Int64) = 300\n",
"(typeof(data), size(data)) = (Array{Float32,2}, (2000000, 300))\n"
]
}
],
"source": [
"const FASTTEXT_VERSION = Int32(12); # Version 1b \n",
"const FASTTEXT_FILEFORMAT_MAGIC_INT32 = Int32(793712314);\n",
"\n",
"\n",
"function load_header(fh)\n",
"\t### Check Model\n",
" @show magic = read(fh, Int32)\n",
" @assert magic== FASTTEXT_FILEFORMAT_MAGIC_INT32\n",
" @show version = read(fh, Int32)\n",
" @assert version == FASTTEXT_VERSION\n",
" println()\n",
"end\n",
"\n",
"function load_args(fh)\n",
" ## Load Args https://github.com/facebookresearch/fastText/blob/master/src/args.cc#L261\n",
" @show args_dim = read(fh, Int32)\n",
" @show args_ws = read(fh, Int32)\n",
" @show args_epoch = read(fh, Int32)\n",
" @show args_minCount = read(fh, Int32)\n",
" @show args_neg = read(fh, Int32)\n",
" @show args_wordNgrams = read(fh, Int32)\n",
" @show args_loss = read(fh, Int32)\n",
" @show args_model = read(fh, Int32)\n",
" @show args_bucket = read(fh, Int32)\n",
" @show args_minn = read(fh, Int32)\n",
" @show args_maxn = read(fh, Int32)\n",
" @show args_lrUpdateRate = read(fh, Int32)\n",
" @show args_t = read(fh, Float64)\n",
" println()\n",
"end\n",
"\n",
"function load_dict(fh)\n",
" ## Load model dict, https://github.com/facebookresearch/fastText/blob/master/src/dictionary.cc#L419 \n",
" @show size_ = read(fh, Int32)\n",
" @show nwords = read(fh, Int32)\n",
" @show nlabels = read(fh, Int32)\n",
" @show ntokens = read(fh, Int64)\n",
" @show pruneidx_size_ = read(fh, Int64)\n",
" \n",
" println()\n",
" words_ = map(1:size_) do ii\n",
" e_word=readuntil(fh, '\\0')[1:end-1]\n",
" e_count=read(fh, Int64)\n",
" e_entry_type=read(fh, EntryType)\n",
" Entry(e_word, e_count, e_entry_type, Int32[])\n",
" end\n",
" @show length(words_)\n",
" @show words_[1]\n",
" @show words_[2]\n",
" @show words_[3]\n",
" @show words_[end-1]\n",
" @show words_[end]\n",
" println()\n",
" @assert pruneidx_size_ < 0 \n",
" # Avoid loading this stuff https://github.com/facebookresearch/fastText/blob/master/src/dictionary.cc#L437\n",
" println()\n",
"\t\n",
"\twords_\n",
"end\n",
"\n",
"function load_matrix(fh)\n",
" ### Load Matrix\n",
" #https://github.com/facebookresearch/fastText/blob/master/src/matrix.cc#L114\n",
" \n",
" @show m_ = read(fh, Int64)\n",
" @show n_ = read(fh, Int64)\n",
" data = read(fh, Float32, (m_, n_)) # Note `real` is a typedef for `float32`\n",
" @show typeof(data), size(data)\n",
"\tdata\n",
"end\n",
"\n",
"function load_fasttext_bin(filename)\n",
"\topen(filename) do fh\n",
"\t\tload_header(fh)\n",
"\t\tload_args(fh)\n",
"\t\tload_dict(fh)\n",
"\t\t\n",
"\t\t\n",
"\t\t@show quant_input = read(fh, Bool)\n",
"\t\t@assert !quant_input # avoid that stuff\n",
"\t\tinput_ = load_matrix(fh)\n",
"\t\t\n",
"\t\t@show quant_output = read(fh, Bool)\n",
"\t\t@assert !quant_output # avoid that stuff\n",
"\t\toutput_ = load_matrix(fh)\n",
"\t\t\n",
" @assert(eof(fh))\n",
"\tend\n",
"end\n",
"\n",
"\n",
"load_fasttext_bin(@datadep_str dd_name)\n"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"search: \u001b[1mr\u001b[22m\u001b[1me\u001b[22m\u001b[1ma\u001b[22m\u001b[1md\u001b[22m\u001b[1ms\u001b[22m\u001b[1mt\u001b[22m\u001b[1mr\u001b[22m\u001b[1mi\u001b[22m\u001b[1mn\u001b[22m\u001b[1mg\u001b[22m\n",
"\n"
]
},
{
"data": {
"text/markdown": [
"```\n",
"readstring(stream::IO)\n",
"readstring(filename::AbstractString)\n",
"```\n",
"\n",
"Read the entire contents of an I/O stream or a file as a string. The text is assumed to be encoded in UTF-8.\n"
],
"text/plain": [
"```\n",
"readstring(stream::IO)\n",
"readstring(filename::AbstractString)\n",
"```\n",
"\n",
"Read the entire contents of an I/O stream or a file as a string. The text is assumed to be encoded in UTF-8.\n"
]
},
"execution_count": 42,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"?readstring"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Julia 0.6.2",
"language": "julia",
"name": "julia-0.6"
},
"language_info": {
"file_extension": ".jl",
"mimetype": "application/julia",
"name": "julia",
"version": "0.6.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

0 comments on commit 3bd0ab2

Please sign in to comment.