-
Notifications
You must be signed in to change notification settings - Fork 26
/
cpdfattach.ml
358 lines (344 loc) · 15.2 KB
/
cpdfattach.ml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
open Pdfutil
open Pdfio
open Cpdferror
(* Remove characters which might not make good filenames. In, UTF8, out UTF8. *)
let remove_unsafe_characters s =
let codepoints = Pdftext.codepoints_of_utf8 s in
let codepoints =
lose
(function x ->
x = int_of_char '/'
|| x = int_of_char '?'
|| x = int_of_char '<'
|| x = int_of_char '>'
|| x = int_of_char '\\'
|| x = int_of_char ':'
|| x = int_of_char '*'
|| x = int_of_char '|'
|| x = int_of_char '\"'
|| x = int_of_char '^'
|| x = int_of_char '+'
|| x = int_of_char '='
|| x < 32
|| x = 127)
codepoints
in
match codepoints with
| 46::more -> Pdftext.utf8_of_codepoints codepoints (* Don't produce a dotfile *)
| chars -> Pdftext.utf8_of_codepoints codepoints
(* Attaching files *)
let attach_file ?memory keepversion topage pdf file =
let data =
match memory with
Some data -> data
| None ->
let ch = open_in_bin file in
let len = in_channel_length ch in
let stream = mkbytes len in
let i = input_of_channel ch in
setinit i stream 0 len;
close_in ch;
stream
in
let filestream =
Pdf.Stream
(ref (Pdf.Dictionary
[("/Length", Pdf.Integer (bytes_size data));
("/Type", Pdf.Name "/EmbeddedFile");
("/Params",
Pdf.Dictionary
[("/Size", Pdf.Integer (bytes_size data));
("/CheckSum", Pdf.String (Digest.string (string_of_bytes data)))
])],
Pdf.Got data))
in
let filestream_num = Pdf.addobj pdf filestream in
let basename = Pdftext.pdfdocstring_of_utf8 (Filename.basename file) in
let filespec =
Pdf.Dictionary
[("/EF", Pdf.Dictionary ["/F", Pdf.Indirect filestream_num]);
("/F", Pdf.String basename);
("/Type", Pdf.Name "/Filespec");
("/Desc", Pdf.String "");
("/UF", Pdf.String basename)]
in
match topage with
| None ->
(* Look up /Names and /EmbeddedFiles and /Names. *)
let rootdict = Pdf.lookup_obj pdf pdf.Pdf.root in
let namedict =
match Pdf.lookup_direct pdf "/Names" rootdict with
| None -> Pdf.Dictionary []
| Some namedict -> namedict
in
let embeddednamedict =
match Pdf.lookup_direct pdf "/EmbeddedFiles" namedict with
| None -> Pdf.Dictionary []
| Some embeddednamedict -> embeddednamedict
in
let elts =
match Pdf.lookup_direct pdf "/Names" embeddednamedict with
| Some (Pdf.Array elts) -> elts
| _ -> []
in
let filespecobj = Pdf.addobj pdf filespec in
let names' = Pdf.Array (elts @ [Pdf.String basename; Pdf.Indirect filespecobj]) in
let embeddednamedict' = Pdf.add_dict_entry embeddednamedict "/Names" names' in
let namedict' = Pdf.add_dict_entry namedict "/EmbeddedFiles" embeddednamedict' in
let rootdict' = Pdf.add_dict_entry rootdict "/Names" namedict' in
let rootnum = Pdf.addobj pdf rootdict' in
{pdf with
Pdf.minor = if keepversion || pdf.Pdf.major > 1 then pdf.Pdf.minor else max pdf.Pdf.minor 4;
Pdf.root = rootnum;
Pdf.trailerdict =
Pdf.add_dict_entry
pdf.Pdf.trailerdict "/Root" (Pdf.Indirect rootnum)}
| Some pagenumber ->
let pages = Pdfpage.pages_of_pagetree pdf in
if pagenumber < 0 || pagenumber > length pages then error "attach_file: Page not found" else
let page = select pagenumber pages in
let annots =
match Pdf.lookup_direct pdf "/Annots" page.Pdfpage.rest with
| Some (Pdf.Array annots) -> annots
| _ -> []
in
let rect =
let minx, miny, maxx, maxy = Pdf.parse_rectangle pdf page.Pdfpage.mediabox in
Pdf.Array [Pdf.Real 18.; Pdf.Real (maxy -. 45.); Pdf.Real 45.; Pdf.Real (maxy -. 18.)]
in
let filespecobj = Pdf.addobj pdf filespec in
let annot =
Pdf.Dictionary
[("/FS", Pdf.Indirect filespecobj);
("/Subtype", Pdf.Name "/FileAttachment");
("/Contents", Pdf.String basename);
("/Rect", rect)]
in
let annots' = Pdf.Array (annot::annots) in
let page' =
{page with Pdfpage.rest = Pdf.add_dict_entry page.Pdfpage.rest "/Annots" annots'}
in
let pages' = replace_number pagenumber page' pages in
let pdf = Pdfpage.change_pages true pdf pages' in
{pdf with
Pdf.minor = if keepversion || pdf.Pdf.major > 1 then pdf.Pdf.minor else max pdf.Pdf.minor 4}
type attachment =
{name : string;
pagenumber : int;
data : unit -> Pdfio.bytes}
let list_attached_files pdf =
let toplevel =
match Pdf.lookup_direct pdf "/Root" pdf.Pdf.trailerdict with
| None -> []
| Some rootdict ->
match Pdf.lookup_direct pdf "/Names" rootdict with
| None -> []
| Some namedict ->
match Pdf.lookup_direct pdf "/EmbeddedFiles" namedict with
| Some nametree ->
map
(function (x, ef) ->
match Pdf.lookup_direct pdf "/EF" ef with
| Some ((Pdf.Dictionary _) as d) ->
begin match Pdf.lookup_direct pdf "/F" d with
| Some stream ->
{name = Pdftext.utf8_of_pdfdocstring x;
pagenumber = 0;
data =
(fun () ->
try
Pdf.getstream stream;
Pdfcodec.decode_pdfstream pdf stream;
match stream with
Pdf.Stream {contents = (_, Pdf.Got data)} -> data
| _ -> raise Not_found
with
_ -> raise (Pdf.PDFError "could not retreive attachment data"))}
| None -> raise (Pdf.PDFError "/F not found")
end
| _ -> raise (Pdf.PDFError "/EF not found"))
(option_map
(function (Pdf.String s, ef) -> Some (s, ef) | _ -> None)
(Pdf.contents_of_nametree pdf nametree))
| _ -> []
in let pagelevel =
let pages = Pdfpage.pages_of_pagetree pdf in
flatten
(map2
(fun page pagenumber ->
option_map
(function annot ->
match Pdf.lookup_direct pdf "/Subtype" annot with
| Some (Pdf.Name "/FileAttachment") ->
(match Pdf.lookup_direct pdf "/Contents" annot with
| Some (Pdf.String s) ->
begin match Pdf.lookup_direct pdf "/FS" annot with
| Some ((Pdf.Dictionary _) as d) ->
(*Pdfe.log (Printf.sprintf "%s\n" (Pdfwrite.string_of_pdf d));*)
begin match Pdf.lookup_direct pdf "/EF" d with
| Some ((Pdf.Dictionary _) as d) ->
begin match Pdf.lookup_direct pdf "/F" d with
| Some stream ->
Some
{name = Pdftext.utf8_of_pdfdocstring s;
pagenumber = pagenumber;
data =
(fun () ->
try
Pdf.getstream stream;
Pdfcodec.decode_pdfstream pdf stream;
match stream with
Pdf.Stream {contents = (_, Pdf.Got data)} -> data
| _ -> raise Not_found
with
_ -> raise (Pdf.PDFError "could not retreive attachment data"))}
| _ -> raise (Pdf.PDFError "no /F found in attachment")
end
| _ ->
Some
{name = Pdftext.utf8_of_pdfdocstring s;
pagenumber = pagenumber;
data = (fun () -> raise (Pdf.PDFError "no attachment data"))}
end
| _ -> None
end
| _ -> None)
| _ -> None)
(match Pdf.lookup_direct pdf "/Annots" page.Pdfpage.rest with
| Some (Pdf.Array annots) -> annots
| _ -> []))
pages
(indx pages))
in
toplevel @ pagelevel
(* Remove Attached files *)
let remove_attached_files_on_pages pdf =
let remove_from_page page =
{page with Pdfpage.rest =
Pdf.add_dict_entry page.Pdfpage.rest "/Annots"
(Pdf.Array
(option_map
(function annot ->
match Pdf.lookup_direct pdf "/Subtype" annot with
| Some (Pdf.Name "/FileAttachment") -> None
| _ -> Some annot)
(match Pdf.lookup_direct pdf "/Annots" page.Pdfpage.rest with
| Some (Pdf.Array annots) -> annots
| _ -> [])))}
in
Pdfpage.change_pages true pdf (map remove_from_page (Pdfpage.pages_of_pagetree pdf))
let remove_attached_files pdf =
let pdf = remove_attached_files_on_pages pdf in
match Pdf.lookup_direct pdf "/Root" pdf.Pdf.trailerdict with
| None -> pdf
| Some rootdict ->
match Pdf.lookup_direct pdf "/Names" rootdict with
| None -> pdf
| Some namedict ->
let namedict' = Pdf.remove_dict_entry namedict "/EmbeddedFiles" in
let rootdict' = Pdf.add_dict_entry rootdict "/Names" namedict' in
let rootdict'num = Pdf.addobj pdf rootdict' in
{pdf with
Pdf.root =
rootdict'num;
Pdf.trailerdict =
Pdf.add_dict_entry pdf.Pdf.trailerdict "/Root" (Pdf.Indirect rootdict'num)}
let dump_attachment out pdf (_, embeddedfile) =
match Pdf.lookup_direct pdf "/F" embeddedfile with
| Some (Pdf.String s) ->
let efdata =
begin match Pdf.lookup_direct pdf "/EF" embeddedfile with
| Some d ->
let stream =
match Pdf.lookup_direct pdf "/F" d with
| Some s -> s
| None -> error "Bad embedded file stream"
in
Pdfcodec.decode_pdfstream_until_unknown pdf stream;
begin match stream with Pdf.Stream {contents = (_, Pdf.Got b)} -> b | _ -> error "Bad embedded file stream" end
| _ -> error "Bad embedded file stream"
end
in
let s = remove_unsafe_characters (Pdftext.utf8_of_pdfdocstring s) in
let filename = if out = "" then s else out ^ Filename.dir_sep ^ s in
begin try
let fh = open_out_bin filename in
for x = 0 to bytes_size efdata - 1 do output_byte fh (bget efdata x) done;
close_out fh
with
e -> Pdfe.log (Printf.sprintf "Failed to write attachment to %s\n" filename);
end
| _ -> ()
let dump_attached_document pdf out =
let root = Pdf.lookup_obj pdf pdf.Pdf.root in
let names =
match Pdf.lookup_direct pdf "/Names" root with Some n -> n | _ -> Pdf.Dictionary []
in
match Pdf.lookup_direct pdf "/EmbeddedFiles" names with
| Some x ->
iter (dump_attachment out pdf) (Pdf.contents_of_nametree pdf x)
| None -> ()
let dump_attached_page pdf out page =
let annots =
match Pdf.lookup_direct pdf "/Annots" page.Pdfpage.rest with
| Some (Pdf.Array l) -> l
| _ -> []
in
let efannots =
keep
(fun annot ->
match Pdf.lookup_direct pdf "/Subtype" annot with
| Some (Pdf.Name "/FileAttachment") -> true
| _ -> false)
annots
in
let fsannots = option_map (Pdf.lookup_direct pdf "/FS") efannots in
iter (dump_attachment out pdf) (map (fun x -> 0, x) fsannots)
(* Dump both document-level and page-level attached files to file, using their file names *)
let dump_attached_files pdf out =
try
dump_attached_document pdf out;
iter (dump_attached_page pdf out) (Pdfpage.pages_of_pagetree pdf)
with
e -> error (Printf.sprintf "Couldn't dump attached files: %s\n" (Printexc.to_string e))
let size_attachment pdf (_, embeddedfile) =
match Pdf.lookup_direct pdf "/F" embeddedfile with
| Some (Pdf.String s) ->
begin match Pdf.lookup_direct pdf "/EF" embeddedfile with
| Some d ->
let stream =
match Pdf.lookup_direct pdf "/F" d with
| Some s -> s
| None -> error "Bad embedded file stream"
in
begin match stream with Pdf.Stream {contents = (_, Pdf.Got b)} -> bytes_size b | _ -> error "Bad embedded file stream" end
| _ -> error "Bad embedded file stream"
end
| _ -> 0
let size_page_files pdf page =
let annots =
match Pdf.lookup_direct pdf "/Annots" page.Pdfpage.rest with
| Some (Pdf.Array l) -> l
| _ -> []
in
let efannots =
keep
(fun annot ->
match Pdf.lookup_direct pdf "/Subtype" annot with
| Some (Pdf.Name "/FileAttachment") -> true
| _ -> false)
annots
in
let fsannots = option_map (Pdf.lookup_direct pdf "/FS") efannots in
map (size_attachment pdf) (map (fun x -> 0, x) fsannots)
let size_document_files pdf =
let root = Pdf.lookup_obj pdf pdf.Pdf.root in
let names =
match Pdf.lookup_direct pdf "/Names" root with Some n -> n | _ -> Pdf.Dictionary []
in
match Pdf.lookup_direct pdf "/EmbeddedFiles" names with
| Some x ->
sum (map (size_attachment pdf) (Pdf.contents_of_nametree pdf x))
| None -> 0
let size_attached_files pdf =
size_document_files pdf + sum (flatten (map (size_page_files pdf) (Pdfpage.pages_of_pagetree pdf)))