-
Notifications
You must be signed in to change notification settings - Fork 26
/
cpdfbookmarks.ml
411 lines (385 loc) · 16.9 KB
/
cpdfbookmarks.ml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
open Pdfutil
open Cpdferror
(* Verify a list of bookmarks. Positive jumps of > 1 not allowed, no numbers
smaller than 0. *)
let rec verify_bookmarks pdf lastlevel fastrefnums endpage = function
| [] -> true
| {Pdfmarks.level = level; Pdfmarks.target = target}::more ->
let page = Pdfpage.pagenumber_of_target pdf ~fastrefnums target in
level < lastlevel + 2 &&
level >= 0 &&
page <= endpage &&
page >= 0 &&
verify_bookmarks pdf level fastrefnums endpage more
let verify_bookmarks pdf lastlevel endpage marks =
let refnums = Pdf.page_reference_numbers pdf in
let fastrefnums = hashtable_of_dictionary (combine refnums (indx refnums)) in
match marks with
| [] -> true
| m::more -> m.Pdfmarks.level = 0 && verify_bookmarks pdf lastlevel fastrefnums endpage more
(* Parse a line of the bookmarks file. *)
(* Un-escape things which are escaped. Quotes, newlines and backslashes *)
let rec fixup_characters prev = function
| [] -> rev prev
| '\\'::'\\'::t -> fixup_characters ('\\'::prev) t
| '\\'::'"'::t -> fixup_characters ('"'::prev) t
| '\\'::'n'::t -> fixup_characters ('\n'::prev) t
| h::t -> fixup_characters (h::prev) t
(* If optionaldest = [Pdfgenlex.LexString s], we parse the string, convert the
* integer to an indirect of the real page target, and then put it in. *)
let target_of_markfile_obj pdf i' pdfobj =
(*Printf.printf "Parsed %s\n" (Pdfwrite.string_of_pdf pdfobj);*)
match pdfobj with
Pdf.Array (Pdf.Integer x::more) ->
let pageobjnum = Pdfpage.page_object_number pdf i' in
begin match pageobjnum with
None ->
raise (Pdf.PDFError "bookmark_of_data: page obj num not found")
| Some p ->
Pdfdest.read_destination pdf (Pdf.Array (Pdf.Indirect p::more))
end
(* Need to deal with "null", "(string)", and "<<other thing like action" *)
| Pdf.Null -> Pdfdest.NullDestination
| Pdf.String s -> Pdfdest.read_destination pdf (Pdf.String s)
| x -> Pdfdest.Action x
let target_of_markfile_target pdf i' = function
| [Pdfgenlex.LexString s] ->
let pdfobj = Pdfread.parse_single_object s in
target_of_markfile_obj pdf i' pdfobj
| _ -> Pdfpage.target_of_pagenumber pdf i'
let bookmark_of_data pdf i s i' isopen optionaldest =
{Pdfmarks.level = i;
Pdfmarks.text = Pdftext.pdfdocstring_of_utf8 (implode (fixup_characters [] (explode s)));
Pdfmarks.target = target_of_markfile_target pdf i' optionaldest;
Pdfmarks.isopen = isopen;
Pdfmarks.colour = (0., 0., 0.);
Pdfmarks.flags = 0} (* Old bookmark file format does not support colour or flags *)
let target_of_json_target pdf pagenumber target =
target_of_markfile_obj pdf pagenumber (Cpdfjson.object_of_json target)
let mark_of_json pdf = function
| `Assoc [("level", `Int level);
("text", `String text);
("page", `Int pagenumber);
("open", `Bool openstatus);
("target", target);
("colour", `List [`Float r; `Float g; `Float b]);
("italic", `Bool italic);
("bold", `Bool bold)] ->
{Pdfmarks.level = level;
Pdfmarks.text = Pdftext.pdfdocstring_of_utf8 text;
Pdfmarks.target = target_of_json_target pdf pagenumber target;
Pdfmarks.isopen = openstatus;
Pdfmarks.colour = (r, g, b);
Pdfmarks.flags = (if italic then 1 else 0) lor ((if bold then 1 else 0) lsl 1)}
| _ -> error "malformed mark in mark_of_json"
let marks_of_json pdf = function
| `List ms -> map (mark_of_json pdf) ms
| _ -> error "top level of JSON boomark file not a list"
let parse_bookmark_file_json verify pdf i =
let module J = Cpdfyojson.Safe in
try
let json =
match i.Pdfio.caml_channel with
| Some ch -> J.from_channel ch
| None ->
let content = Pdfio.string_of_input i in
J.from_string content
in
let marks = marks_of_json pdf json in
if verify then
if verify_bookmarks pdf 0 (Pdfpage.endpage pdf) marks then marks else
error "Bad bookmark file (References non-existant pages or is malformed)"
else
marks
with
e ->
error (Printf.sprintf "Malformed JSON bookmark file (%s)" (Printexc.to_string e))
let parse_bookmark_file verify pdf input =
let currline = ref 0 in
try
let lines = Pdfio.read_lines input in
let currline = ref 0 in
let bookmarks = ref [] in
iter
(function line ->
match
incr currline;
Pdfgenlex.lex_string line
with
| Pdfgenlex.LexInt i::Pdfgenlex.LexString s::Pdfgenlex.LexInt i'::Pdfgenlex.LexName "open"::optionaldest ->
bookmarks =| bookmark_of_data pdf i s i' true optionaldest
| Pdfgenlex.LexInt i::Pdfgenlex.LexString s::Pdfgenlex.LexInt i'::optionaldest ->
bookmarks =| bookmark_of_data pdf i s i' false optionaldest
| [] -> () (* ignore blank lines *)
| _ ->
error ("Bad bookmark file, line " ^ (string_of_int !currline)))
lines;
let bookmarks = rev !bookmarks in
if verify then
if verify_bookmarks pdf 0 (Pdfpage.endpage pdf) bookmarks
then bookmarks
else
error
"Bad bookmark file (References non-existant pages or is malformed)"
else
bookmarks
with
e ->
error
(Printf.sprintf
"Bad bookmark file (syntax) at line %i (error was %s)"
!currline
(Printexc.to_string e))
let add_bookmarks ~json verify input pdf =
let parsed =
(if json then parse_bookmark_file_json else parse_bookmark_file) verify pdf input in
(*iter (fun b -> flprint (Pdfmarks.string_of_bookmark b); flprint "\n") parsed;*)
Pdfmarks.add_bookmarks parsed pdf
(* List bookmarks *)
let output_string_of_target pdf fastrefnums x =
match Pdfdest.pdfobject_of_destination x with
| Pdf.Array (_::more) ->
let a =
Pdf.Array (Pdf.Integer (Pdfpage.pagenumber_of_target ~fastrefnums pdf x)::more)
in
"\"" ^ Pdfwrite.string_of_pdf a ^ "\""
| x -> "\"" ^ Pdfwrite.string_of_pdf x ^ "\""
let json_of_target pdf fastrefnums x =
match Pdfdest.pdfobject_of_destination x with
| Pdf.Array (_::more) ->
let a =
Pdf.Array (Pdf.Integer (Pdfpage.pagenumber_of_target ~fastrefnums pdf x)::more)
in
Cpdfjson.json_of_object pdf (fun _ -> ()) ~utf8:false ~no_stream_data:false ~parse_content:false a
| x -> Cpdfjson.json_of_object pdf (fun _ -> ()) ~utf8:false ~no_stream_data:false ~parse_content:false x
let output_json_marks output calculate_page_number pdf fastrefnums marks =
let module J = Cpdfyojson.Safe in
let json_of_mark m =
let r, g, b = m.Pdfmarks.colour in
`Assoc
[("level", `Int m.Pdfmarks.level);
("text", `String (Pdftext.utf8_of_pdfdocstring (Pdftext.simplify_utf16be m.Pdfmarks.text)));
("page", `Int (calculate_page_number m));
("open", `Bool m.Pdfmarks.isopen);
("target", json_of_target pdf fastrefnums m.Pdfmarks.target);
("colour", `List [`Float r; `Float g; `Float b]);
("italic", `Bool (m.Pdfmarks.flags land 1 > 0));
("bold", `Bool (m.Pdfmarks.flags land 2 > 0))]
in
let json = `List (map json_of_mark marks) in
output.Pdfio.output_string (J.pretty_to_string json)
let process_string encoding s =
let rec replace c x y = function
| [] -> []
| h::t when h = c -> x::y::replace c x y t
| h::t -> h::replace c x y t
in
(* Convert to UTF8, raw, or stripped, and escape backslashed and quotation marks *)
let codepoints = Pdftext.codepoints_of_pdfdocstring s in
let escaped =
let bs = int_of_char '\\'
and nl = int_of_char '\n'
and n = int_of_char 'n'
and q = int_of_char '\"' in
replace q bs q (replace nl bs n (replace bs bs bs codepoints))
in
let process_stripped escaped =
let b = Buffer.create 200 in
iter
(fun x ->
if x <= 127 then Buffer.add_char b (char_of_int x))
escaped;
Buffer.contents b
in
match encoding with
| Cpdfmetadata.UTF8 -> Pdftext.utf8_of_codepoints escaped
| Cpdfmetadata.Stripped -> process_stripped escaped
| Cpdfmetadata.Raw -> s
(* List the bookmarks in the given range to the given output *)
let list_bookmarks ~json encoding range pdf output =
let bookmarks = Pdfmarks.read_bookmarks pdf in
let refnums = Pdf.page_reference_numbers pdf in
let rangetable = hashset_of_list range in
let range_is_all = range = ilist 1 (Pdfpage.endpage pdf) in
let fastrefnums = hashtable_of_dictionary (combine refnums (indx refnums)) in
(* Find the pagenumber of each bookmark target. If it is in the range,
* keep that bookmark. Also keep the bookmark if its target is the null
* destination. *)
let inrange =
keep
(function x ->
range_is_all ||
x.Pdfmarks.target = Pdfdest.NullDestination ||
(match x.Pdfmarks.target with Pdfdest.NamedDestinationElsewhere _ -> true | _ -> false) ||
Hashtbl.mem rangetable (Pdfpage.pagenumber_of_target ~fastrefnums pdf x.Pdfmarks.target)) bookmarks
in
let calculate_page_number mark =
(* Some buggy PDFs use integers for page numbers instead of page
* object references. Adobe Reader and Preview seem to support
* this, for presumably historical reasons. So if we see a
* OtherDocPageNumber (which is what Pdfdest parses these as,
* because that's what they are legitimately, we use this as the
* page number. It is zero based, though, and we are one-based, so
* we add one. Pdfpage.pagenumber_of_target has been modified to support this.*)
Pdfpage.pagenumber_of_target ~fastrefnums pdf mark.Pdfmarks.target
in
if json then
output_json_marks output calculate_page_number pdf fastrefnums inrange
else
iter
(function mark ->
output.Pdfio.output_string
(Printf.sprintf "%i \"%s\" %i%s %s\n"
mark.Pdfmarks.level
(process_string encoding mark.Pdfmarks.text)
(calculate_page_number mark)
(if mark.Pdfmarks.isopen then " open" else "")
(output_string_of_target pdf fastrefnums mark.Pdfmarks.target)))
inrange
let get_bookmarks_json pdf =
let o, br = Pdfio.input_output_of_bytes (20 * 1024) in
list_bookmarks ~json:true Cpdfmetadata.UTF8 (ilist 1 (Pdfpage.endpage pdf)) pdf o;
Pdfio.extract_bytes_from_input_output o br
let get_bookmark_name encoding pdf marks splitlevel n _ =
let refnums = Pdf.page_reference_numbers pdf in
let fastrefnums = hashtable_of_dictionary (combine refnums (indx refnums)) in
match keep (function m -> n = Pdfpage.pagenumber_of_target ~fastrefnums pdf m.Pdfmarks.target && m.Pdfmarks.level <= splitlevel) marks with
| {Pdfmarks.text = title}::_ -> Cpdfattach.remove_unsafe_characters (Pdftext.utf8_of_pdfdocstring title)
| _ -> ""
(* @F means filename without extension *)
(* @N means sequence number with no padding *)
(* @S means start page of this section *)
(* @E means end page of this section *)
(* @B means bookmark name at start page *)
(* @b52@ means bookmark name at start page truncated to 52 characters using crude UTF8 truncation. *)
let process_others encoding marks pdf splitlevel filename sequence startpage endpage s =
let trim_utf8 len l =
let l = try take l len with _ -> l in
(* This truncator is far from perfect, but it does yield a valid UTF8 string, when given one. *)
match rev l with
| b2::b1::b0::t ->
if int_of_char b2 land 0x80 > 0 then
begin
if int_of_char b2 land 0x40 > 0 then rev (b1::b0::t)
else if int_of_char b1 land 0xe0 = 0xe0 then rev (b0::t)
else if int_of_char b0 land 0xf0 = 0xf0 then rev t
else l
end
else
l
| _ -> l
in
let rec find_ats p = function
'@'::r -> find_ats (p + 1) r
| r -> (p, r)
in
let string_of_int_width w i =
if w < 0 then raise (Pdf.PDFError "width of field too narrow")
else if w > 8 then raise (Pdf.PDFError "width of field too broad") else
let formats =
[|format_of_string "%i";
format_of_string "%i";
format_of_string "%02i";
format_of_string "%03i";
format_of_string "%04i";
format_of_string "%05i";
format_of_string "%06i";
format_of_string "%07i";
format_of_string "%08i"|]
in
Printf.sprintf formats.(w) i
in
let rec procss prev = function
| [] -> rev prev
| '@'::'F'::t -> procss (rev (explode filename) @ prev) t
| '@'::'N'::t ->
let width, rest = find_ats 0 t in
procss (rev (explode (string_of_int_width width sequence)) @ prev) rest
| '@'::'S'::t ->
let width, rest = find_ats 0 t in
procss (rev (explode (string_of_int_width width startpage)) @ prev) rest
| '@'::'E'::t ->
let width, rest = find_ats 0 t in
procss (rev (explode (string_of_int_width width endpage)) @ prev) rest
| '@'::'B'::t ->
procss (rev (explode (get_bookmark_name encoding pdf marks splitlevel startpage pdf)) @ prev) t
| '@'::'b'::t ->
let number, rest = cleavewhile (function '0'..'9' -> true | _ -> false) t in
begin try ignore (int_of_string (implode number)) with _ -> error "Bad @b spec" end;
let text = trim_utf8 (int_of_string (implode number)) (explode (get_bookmark_name encoding pdf marks splitlevel startpage pdf)) in
procss (rev text @ prev) (if rest = [] then [] else tl rest)
| h::t -> procss (h::prev) t
in
implode (procss [] (explode s))
let name_of_spec encoding marks (pdf : Pdf.t) splitlevel spec n filename startpage endpage =
let fill l n =
let chars = explode (string_of_int n) in
if length chars > l
then implode (drop chars (length chars - l))
else implode ((many '0' (l - length chars)) @ chars)
in
let chars = explode spec in
let before, including = cleavewhile (neq '%') chars in
let percents, after = cleavewhile (eq '%') including in
if percents = []
then
process_others encoding marks pdf splitlevel filename n startpage endpage spec
else
process_others encoding marks pdf splitlevel filename n startpage endpage
(implode before ^ fill (length percents) n ^ implode after)
(* Indent bookmarks in each file by one and add a title bookmark pointing to the first page. *)
let add_bookmark_title filename use_title pdf =
let title =
if use_title then
match Cpdfmetadata.get_info_utf8 pdf "/Title", Cpdfmetadata.get_xmp_info pdf "/Title" with
"", x | x, "" | _, x -> x
else
Filename.basename filename
in
let marks = Pdfmarks.read_bookmarks pdf in
let page1objnum =
match Pdfpage.page_object_number pdf 1 with
None -> error "add_bookmark_title: page not found"
| Some x -> x
in
let newmarks =
{Pdfmarks.level = 0;
Pdfmarks.text = Pdftext.pdfdocstring_of_utf8 title;
Pdfmarks.target = Pdfdest.XYZ (Pdfdest.PageObject page1objnum, None, None, None);
Pdfmarks.isopen = false;
Pdfmarks.colour = (0., 0., 0.);
Pdfmarks.flags = 0}
::map (function m -> {m with Pdfmarks.level = m.Pdfmarks.level + 1}) marks
in
Pdfmarks.add_bookmarks newmarks pdf
let bookmarks_open_to_level n pdf =
let marks = Pdfmarks.read_bookmarks pdf in
let newmarks =
map
(fun m -> {m with Pdfmarks.isopen = m.Pdfmarks.level < n})
marks
in
Pdfmarks.add_bookmarks newmarks pdf
(* Alter bookmark destinations given a hash table of (old page reference
* number, new page reference number) pairings *)
let change_destination t = function
Pdfdest.XYZ (Pdfdest.PageObject p, a, b, c) ->
Pdfdest.XYZ (Pdfdest.PageObject (Hashtbl.find t p), a, b, c)
| Pdfdest.Fit (Pdfdest.PageObject p) ->
Pdfdest.Fit (Pdfdest.PageObject (Hashtbl.find t p))
| Pdfdest.FitH (Pdfdest.PageObject p, x) ->
Pdfdest.FitH (Pdfdest.PageObject (Hashtbl.find t p), x)
| Pdfdest.FitV (Pdfdest.PageObject p, x) ->
Pdfdest.FitV (Pdfdest.PageObject (Hashtbl.find t p), x)
| Pdfdest.FitR (Pdfdest.PageObject p, a, b, c, d) ->
Pdfdest.FitR (Pdfdest.PageObject (Hashtbl.find t p), a, b, c, d)
| Pdfdest.FitB (Pdfdest.PageObject p) ->
Pdfdest.Fit (Pdfdest.PageObject (Hashtbl.find t p))
| Pdfdest.FitBH (Pdfdest.PageObject p, x) ->
Pdfdest.FitBH (Pdfdest.PageObject (Hashtbl.find t p), x)
| Pdfdest.FitBV (Pdfdest.PageObject p, x) ->
Pdfdest.FitBV (Pdfdest.PageObject (Hashtbl.find t p), x)
| x -> x
let change_bookmark t m =
{m with Pdfmarks.target =
try change_destination t m.Pdfmarks.target with Not_found -> m.Pdfmarks.target}