Merge branch 'master' of https://github.com/JuliaText/TextAnalysis.jl …

…into tagging_schemes_patch
JuliaText · Jun 23, 2019 · 2bae9e4 · 2bae9e4
2 parents df3c512 + 5730ba6
commit 2bae9e4
Show file tree

Hide file tree

Showing 15 changed files with 865 additions and 143 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -19,4 +19,4 @@ jobs:
         - julia --project=docs/ docs/make.jl
       after_success: skip
 after_success:
-  - julia -e 'cd(Pkg.dir("TextAnalysis")); Pkg.add("Coverage"); using Coverage; Coveralls.submit(Coveralls.process_folder())';
+- julia -e 'cd(Pkg.dir("TextAnalysis")); Pkg.add("Coverage"); using Coverage; Coveralls.submit(Coveralls.process_folder())';
diff --git a/REQUIRE b/REQUIRE
@@ -6,3 +6,4 @@ WordTokenizers
 Flux
 BSON
 JSON
+DataStructures
diff --git a/docs/src/features.md b/docs/src/features.md
@@ -251,3 +251,51 @@ julia> tags
  "I-PER"
  "E-PER"
 ```
+
+## Parts of Speech Tagger
+
+This tagger can be used to find the POS tag of a word or token in a given sentence. It is based on the `Average Perceptron Algorithm`.
+The model can be trained from scratch, and the weights are saved in the specified location.
+The pretrained model can also be loaded and used directly to predict tags.
+
+### To train model:
+```julia
+julia> tagger = PerceptronTagger(false) #we can use tagger = PerceptronTagger()
+julia> fit!(tagger, [[("today","NN"),("is","VBZ"),("good","JJ"),("day","NN")]])
+iteration : 1
+iteration : 2
+iteration : 3
+iteration : 4
+iteration : 5
+```
+
+### To load pretrained model:
+```julia
+julia> tagger = PerceptronTagger(true)
+loaded successfully
+PerceptronTagger(AveragePerceptron(Set(Any["JJS", "NNP_VBZ", "NN_NNS", "CC", "NNP_NNS", "EX", "NNP_TO", "VBD_DT", "LS", ("Council", "NNP")  …  "NNPS", "NNP_LS", "VB", "NNS_NN", "NNP_SYM", "VBZ", "VBZ_JJ", "UH", "SYM", "NNP_NN", "CD"]), Dict{Any,Any}("i+2 word wetlands"=>Dict{Any,Any}("NNS"=>0.0,"JJ"=>0.0,"NN"=>0.0),"i-1 tag+i word NNP basic"=>Dict{Any,Any}("JJ"=>0.0,"IN"=>0.0),"i-1 tag+i word DT chloride"=>Dict{Any,Any}("JJ"=>0.0,"NN"=>0.0),"i-1 tag+i word NN choo"=>Dict{Any,Any}("NNP"=>0.0,"NN"=>0.0),"i+1 word antarctica"=>Dict{Any,Any}("FW"=>0.0,"NN"=>0.0),"i-1 tag+i word -START- appendix"=>Dict{Any,Any}("NNP"=>0.0,"NNPS"=>0.0,"NN"=>0.0),"i-1 word wahoo"=>Dict{Any,Any}("JJ"=>0.0,"VBD"=>0.0),"i-1 tag+i word DT children's"=>Dict{Any,Any}("NNS"=>0.0,"NN"=>0.0),"i word dnipropetrovsk"=>Dict{Any,Any}("NNP"=>0.003,"NN"=>-0.003),"i suffix hla"=>Dict{Any,Any}("JJ"=>0.0,"NN"=>0.0)…), DefaultDict{Any,Any,Int64}(), DefaultDict{Any,Any,Int64}(), 1, ["-START-", "-START2-"]), Dict{Any,Any}("is"=>"VBZ","at"=>"IN","a"=>"DT","and"=>"CC","for"=>"IN","by"=>"IN","Retrieved"=>"VBN","was"=>"VBD","He"=>"PRP","in"=>"IN"…), Set(Any["JJS", "NNP_VBZ", "NN_NNS", "CC", "NNP_NNS", "EX", "NNP_TO", "VBD_DT", "LS", ("Council", "NNP")  …  "NNPS", "NNP_LS", "VB", "NNS_NN", "NNP_SYM", "VBZ", "VBZ_JJ", "UH", "SYM", "NNP_NN", "CD"]), ["-START-", "-START2-"], ["-END-", "-END2-"], Any[])
+```
+
+### To predict tags:
+```julia
+julia> predict(tagger, ["today", "is"])
+2-element Array{Any,1}:
+ ("today", "NN")
+ ("is", "VBZ")
+```
+
+`PerceptronTagger(load::Bool)`
+
+* load      = Boolean argument; if `true`, the pretrained model is loaded
+
+`fit!(self::PerceptronTagger, sentences::Vector{Vector{Tuple{String, String}}}, save_loc::String, nr_iter::Integer)`
+
+* self      = `PerceptronTagger` object
+* sentences = `Vector` of sentences, each a `Vector` of `Tuple`s pairing a word or token with its POS tag [see the example above]
+* save_loc  = location of file to save the trained weights
+* nr_iter   = Number of passes over `sentences` used to train the model (default 5)
+
+`predict(self::PerceptronTagger, tokens)`
+
+* self      = `PerceptronTagger` object
+* tokens    = `Vector` of words or tokens for which to predict tags
diff --git a/src/TextAnalysis.jl b/src/TextAnalysis.jl
@@ -53,6 +53,7 @@ module TextAnalysis
     export tag_scheme!
     export jackknife_avg, listify_ngrams, weighted_lcs, fmeasure_lcs
     export rouge_l_summary, rouge_l_sentence, rouge_n
+    export PerceptronTagger, fit!, predict
 
     include("tokenizer.jl")
     include("ngramizer.jl")
@@ -81,4 +82,5 @@ module TextAnalysis
     include("tagging_schemes.jl")
     include("utils.jl")
     include("rouge.jl")
+    include("averagePerceptronTagger.jl")
 end