diff --git a/Project.toml b/Project.toml index 61e4820..39cc084 100644 --- a/Project.toml +++ b/Project.toml @@ -1,12 +1,13 @@ name = "WordCloud" uuid = "6385f0a0-cb03-45b6-9089-4e0acc74b26b" authors = ["guoyongzhi "] -version = "0.13.3" +version = "0.13.4" [deps] ColorSchemes = "35d6a980-a343-548e-a6ea-1d62b119f2f4" Colors = "5ae59095-9a9b-59fe-a467-6f913c188581" FileIO = "5789e2e9-d7fb-5bc7-8068-2c6fae9b9549" +Fontconfig = "186bb1d3-e1f7-5a2c-a377-96d770f13627" ImageTransformations = "02fcd773-0e25-5acc-982a-7f6622650795" LanguageIdentification = "35248bf2-58a5-46a0-9b1d-c0a73a50a105" Luxor = "ae8d54c2-7ccd-5906-9d76-62fc9837b5bc" @@ -20,6 +21,7 @@ Stuffing = "4175e07e-e5b7-423e-8796-3ea7f6d48281" ColorSchemes = "3" Colors = "0.9, 0.10, 0.11, 0.12" FileIO = "1" +Fontconfig = "0.4" ImageTransformations = "0.5, 0.6, 0.7, 0.8, 0.9, 0.10" LanguageIdentification = "1.0.1" Luxor = "3.6, 4" diff --git a/WordCloudApp.jl b/WordCloudApp.jl index 332dd99..d220cc4 100644 --- a/WordCloudApp.jl +++ b/WordCloudApp.jl @@ -379,26 +379,8 @@ begin end -# ╔═╡ b09620ef-4495-4c83-ad1c-2d8b0ed70710 -begin - google_fonts = ["Roboto", "Open Sans", "Lato", "Montserrat", "Noto Sans JP", "Roboto Condensed", "Oswald", "Source Sans Pro", "Slabo 27px", "Raleway", "PT Sans", "Poppins", "Roboto Slab", "Merriweather", "Noto Sans", "Ubuntu", "Roboto Mono", "Lora", "Playfair Display", "Nunito", "PT Serif", "Titillium Web", "PT Sans Narrow", "Arimo", "Noto Serif", - "Rubik", "Fira Sans", "Work Sans", "Noto Sans KR", "Quicksand", "Dosis", "Inconsolata", "Oxygen", "Mukta", "Bitter", "Nanum Gothic", "Yanone Kaffeesatz", "Nunito Sans", "Lobster", "Cabin", "Fjalla One", "Indie Flower", "Anton", "Arvo", "Josefin Sans", "Karla", "Libre Baskerville", "Noto Sans TC", "Hind", "Crimson Text", "Hind Siliguri", - "Inter", "Heebo", "Abel", "Libre Franklin", "Barlow", "Varela Round", "Pacifico", "Dancing Script", "Exo 2", "Source Code Pro", "Shadows Into Light", "Merriweather Sans", "Asap", "Bree Serif", "Archivo Narrow", "Play", "Ubuntu Condensed", "Questrial", "Abril Fatface", "Source Serif Pro", "Maven Pro", "Francois One", "Signika", - "EB Garamond", "Comfortaa", "Exo", "Vollkorn", "Teko", "Catamaran", "Kanit", "Cairo", "Amatic SC", "IBM Plex Sans", "Cuprum", "Poiret One", "Rokkitt", "Bebas Neue", "Acme", "PT Sans Caption", "Righteous", "Noto Sans SC", "Alegreya Sans", "Alegreya", "Barlow Condensed", "Prompt", "Gloria Hallelujah", "Patua One", "Crete Round", "Permanent Marker"] - empty!(WordCloud.AvailableFonts) - append!(WordCloud.AvailableFonts, ["$f$w" for w in WordCloud.CandiWeights, f in google_fonts]) - function wordseg_cn(t) - jieba = pyimport("jieba") - pyconvert(Vector{String}, jieba.lcut(t)) - end - WordCloud.settokenizer!("zho", wordseg_cn) - WordCloud.settokenizer!("jpn", TinySegmenter.tokenize) - nothing -end - # ╔═╡ fa6b3269-357e-4bf9-8514-70aff9df427f begin - google_fonts # used to adjust cell order function gen_cloud(words_weights) if outlinewidth isa Number && outlinewidth >= 0 olw = outlinewidth @@ -441,6 +423,17 @@ begin end +# ╔═╡ b09620ef-4495-4c83-ad1c-2d8b0ed70710 +begin + function wordseg_cn(t) + jieba = pyimport("jieba") + pyconvert(Vector{String}, jieba.lcut(t)) + end + WordCloud.settokenizer!("zho", wordseg_cn) + WordCloud.settokenizer!("jpn", TinySegmenter.tokenize) + nothing +end + # ╔═╡ Cell order: # ╟─bda3fa85-04a3-4033-9890-a5b4f10e2a77 # ╟─9191230b-b72a-4707-b7cf-1a51c9cdb217 diff --git a/src/artist.jl b/src/artist.jl index 1abba31..cfacefd 100644 --- a/src/artist.jl +++ b/src/artist.jl @@ -1,45 +1,66 @@ using Random -SansSerifFonts = ["Trebuchet MS", "Heiti TC", "微軟正黑體", "Arial Unicode MS", "Droid Fallback Sans", "sans-serif", "Helvetica", "Verdana", "Hei", - "Arial", "Tahoma", "Microsoft Yahei", "Comic Sans MS", "Impact", "Segoe Script", "STHeiti", "Apple LiGothic", "MingLiU", "Ubuntu", "Segoe UI", - "DejaVu Sans", "DejaVu Sans Mono", "Noto Sans CJK", "Arial Black", "Gadget", "cursive", "Charcoal", "Lucida Sans Unicode", "Lucida Grande", "Geneva"] -SerifFonts = ["Baskerville", "Times New Roman", "Times", "華康儷金黑 Std", "華康儷宋 Std", "DFLiKingHeiStd-W8", "DFLiSongStd-W5", "DejaVu Serif", "SimSun", - "Hiragino Mincho Pro", "LiSong Pro", "新細明體", "serif", "Georgia", "STSong", "FangSong", "KaiTi", "STKaiti", "Courier", "Courier New", "monospace", - "Palatino Linotype", "Book Antiqua", "Palatino", "Lucida Console", "Monaco"] -CandiFonts = union(SansSerifFonts, SerifFonts) -CandiWeights = ["", " Regular", " Normal", " Medium", " Bold", " Light"] -function checkfonts(fonts::AbstractVector) - fname = tempname() - r = Bool[] - open(fname, "w") do f - redirect_stderr(f) do - p = position(f) - for font in fonts - err = false - try - rendertext("a", 1 + rand(), font=font) # 相同字体相同字号仅warning一次,故首次执行最准 - catch - err = true - end - # flush(f) # https://en.cppreference.com/w/cpp/io/c/fseek The standard C++ file streams guarantee both flushing and unshifting - seekend(f) - p2 = position(f) - push!(r, (p2 == p) && !err) - p = p2 +import Fontconfig: list, Pattern +using StopWords + +FontCandidates::Dict{String, Vector{String}} = Dict{String, Vector{String}}() +WeightCandidates::Vector{String} = ["", " Regular", " Normal", " Medium", " Bold", " Light"] + +function listfonts(lang="") + if !isempty(lang) + ps = list(Pattern(lang=lang)) + else + ps = list(Pattern()) + end + names = String[] + for p in ps + name = string(p) + b = findfirst("\"", name) + e = findfirst(":", name) + if b !== nothing && e !== nothing + b = nextind(name, first(b), 1) + e = prevind(name, first(e), 1) + if 0 < b < e < length(name) + push!(names, name[b:e]) end end end - return r + return names end -checkfonts(f) = checkfonts([f]) |> only -function filterfonts(;fonts=CandiFonts, weights=CandiWeights) - candi = ["$f$w" for w in weights, f in fonts] |> vec - candi[checkfonts(candi)] +function reverse_dict(d) + rd = Dict{String, Vector{String}}() + for (k, v) in d + get!(rd, v) do + String[] + end + push!(rd[v], k) + end + return rd end -if Sys.iswindows() - AvailableFonts = [""] -else - AvailableFonts = filterfonts() - push!(AvailableFonts, "") +const id_part1 = reverse_dict(StopWords.part1_id) +const mid_iid = reverse_dict(StopWords.iid_mid) +function expandlangcode(c) + c in StopWords.id_all || (c = get(StopWords.name_id, c, c)) + c in StopWords.id_all || (c = get(StopWords.name_id, titlecase(c), c)) + cs = [] + for c1 in Iterators.flatten((get(mid_iid, c, []), [c])) + for c2 in Iterators.flatten((get(id_part1, c1, []), [c1])) + push!(cs, c2) + end + end + cs +end +function fontsof(lang) + union((listfonts(l) for l in expandlangcode(lang))...) +end +function getfonts(lang) + if haskey(FontCandidates, lang) + return FontCandidates[lang] + else + fs = fontsof(lang) + push!(fs, "") + FontCandidates[lang] = fs + return fs + end end Schemes_colorbrewer = filter(s -> occursin("colorbrewer", colorschemes[s].category), collect(keys(colorschemes))) @@ -333,11 +354,13 @@ function randomlinecolor(colors) linecolor end randomoutline() = rand((0, 0, 0, rand(2:10))) -function randomfonts() +function randomfonts(lang="") if rand() < 0.8 - fonts = rand(AvailableFonts) + fonts = rand(getfonts(lang)) + fonts = fonts * rand(WeightCandidates) else - fonts = rand(AvailableFonts, 2 + floor(Int, 2randexp())) + fonts = rand(getfonts(lang), 2 + floor(Int, 2randexp())) + fonts = [f * rand(WeightCandidates) for f in fonts] rand() > 0.5 && (fonts = tuple(fonts...)) end @show fonts diff --git a/src/wc-class.jl b/src/wc-class.jl index fd8819c..0be8dee 100644 --- a/src/wc-class.jl +++ b/src/wc-class.jl @@ -73,12 +73,12 @@ wordcloud(wordsweights::Tuple; kargs...) = wordcloud(wordsweights...; kargs...) wordcloud(counter::AbstractDict; kargs...) = wordcloud(keys(counter) |> collect, values(counter) |> collect; kargs...) wordcloud(counter::AbstractVector{<:Union{Pair,Tuple,AbstractVector}}; kargs...) = wordcloud(first.(counter), [v[2] for v in counter]; kargs...) function wordcloud(text; language=:auto, stopwords=:auto, stopwords_extra=nothing, maxnum=500, kargs...) - wordcloud(processtext(text, language=language, stopwords=stopwords, stopwords_extra=stopwords_extra, maxnum=maxnum); kargs...) + wordcloud(processtext(text, language=language, stopwords=stopwords, stopwords_extra=stopwords_extra, maxnum=maxnum); language=language, kargs...) end wordcloud(words, weight::Number; kargs...) = wordcloud(words, repeat([weight], length(words)); kargs...) function wordcloud(words::AbstractVector{<:AbstractString}, weights::AbstractVector{<:Real}; colors=:auto, angles=:auto, - mask=:auto, fonts=:auto, + mask=:auto, fonts=:auto, language=:auto, transparent=:auto, minfontsize=:auto, maxfontsize=:auto, spacing::Integer=2, density=0.5, state=layout!, style=:auto, centralword=:auto, reorder=:auto, level=:auto, kargs...) @assert length(words) == length(weights) > 0 @@ -90,8 +90,8 @@ function wordcloud(words::AbstractVector{<:AbstractString}, weights::AbstractVec params[:reorder] = reorder params[:level] = level - colors, angles, mask, svgmask, fonts, transparent = getstylescheme(words, weights; colors=colors, angles=angles, - mask=mask, fonts=fonts, transparent=transparent, params=params, kargs...) + colors, angles, mask, svgmask, fonts, transparent = getstylescheme(words, weights; colors=colors, angles=angles, mask=mask, + fonts=fonts, language=language, transparent=transparent, params=params, kargs...) params[:colors] = Any[colors...] params[:angles] = angles params[:transparent] = transparent @@ -138,7 +138,7 @@ end function getstylescheme(words, weights; colors=:auto, angles=:auto, mask=:auto, masksize=:auto, maskcolor=:default, keepmaskarea=:auto, backgroundcolor=:default, padding=:default, - outline=:default, linecolor=:auto, fonts=:auto, + outline=:default, linecolor=:auto, fonts=:auto, language=:auto, transparent=:auto, params=Dict{Symbol,Any}(), kargs...) merge!(params, kargs) colors in DEFAULTSYMBOLS && (colors = randomscheme(weights)) @@ -249,7 +249,8 @@ function getstylescheme(words, weights; colors=:auto, angles=:auto, mask=:auto, Render.recolor!(mask, maskcolor) # tobitmap后有杂色 https://github.com/JuliaGraphics/Luxor.jl/issues/160 end end - fonts in DEFAULTSYMBOLS && (fonts = randomfonts()) + lang = language in DEFAULTSYMBOLS ? "" : language + fonts in DEFAULTSYMBOLS && (fonts = randomfonts(lang)) fonts = Iterators.take(iter_expand(fonts), length(words)) |> collect colors, angles, mask, svgmask, fonts, transparent end