-
Notifications
You must be signed in to change notification settings - Fork 29
/
pdf.mli
320 lines (236 loc) · 12 KB
/
pdf.mli
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
(** Representing PDF Files in Memory *)
(** {2 PDF Objects} *)
(** Description of stream data which has not yet been read. The type is
abstract; judging by the accessors declared for it further down this interface,
it records a [Pdfio.input], a position and a length. *)
type toget
(** The data of a stream. It is either already in memory ([Got]), or still to be
fetched ([ToGet]) from a position, and of a length, in a [Pdfio.input]. *)
type stream =
| Got of Pdfio.bytes
| ToGet of toget
(** PDF objects. An object is a tree-like structure containing various things.
A PDF file is basically a directed graph of objects. [Array], [Dictionary] and
[Stream] contain further objects; [Indirect] refers to another object by its
object number instead of containing it. *)
type pdfobject =
| Null
| Boolean of bool
| Integer of int
| Real of float
| String of string
| Name of string
| Array of pdfobject list
(* Key-value pairs, kept as an association list. *)
| Dictionary of (string * pdfobject) list
(* Held in a reference, so the pair can be replaced in place.
NOTE(review): presumably the stream dictionary paired with the stream data --
confirm against the implementation. *)
| Stream of (pdfobject * stream) ref
| Indirect of int
(** {2 The Object map} *)
(** You should not expect to manipulate these types and functions directly. *)
(** This type represents a possibly-parsed, possibly-decrypted,
possibly-read-from-an-object-stream object. *)
type objectdata =
(* Not from an object stream, fully parsed, not necessarily decrypted yet *)
| Parsed of pdfobject
(* Was from an object stream, decrypted already when object stream read *)
| ParsedAlreadyDecrypted of pdfobject
(* Not parsed yet. Needs to be read from an object, which may still be encrypted *)
| ToParse
(* Not parsed yet. Will come from an object stream. The two [int] components
are (stream object number, index in stream). NOTE(review): the roles of the
leading hash table and of the trailing function are not described in this
interface -- confirm against the implementation before relying on them. *)
| ToParseFromObjectStream of (int, int list) Hashtbl.t * int * int * (int -> int list -> (int * (objectdata ref * int)) list)
(** Keys of the object map: object numbers. *)
type pdfobjmap_key = int
(** The object map maps object numbers [pdfobjmap_key] to a reference to the
object data and the generation number. *)
type pdfobjmap = (pdfobjmap_key, objectdata ref * int) Hashtbl.t
(** Make an empty object map. *)
val pdfobjmap_empty : unit -> pdfobjmap
(** Find an object in the object map, returning the reference to its object data
together with its generation number. NOTE(review): the behaviour for a key
which is not present is not stated in this interface -- confirm. *)
val pdfobjmap_find : pdfobjmap_key -> pdfobjmap -> objectdata ref * int
(** The objects. Again, you won't normally manipulate this directly.
[maxobjnum] is the biggest object number seen yet. [parse] is a function to
parse a non-object stream object given its object number, [pdfobjects] is the
object map itself. [object_stream_ids] is a hash table of (object number,
was-stored-in-object-stream-number) pairs, which is used to reconstruct stream
objects when preserving them upon write. All fields are mutable. *)
type pdfobjects =
{mutable maxobjnum : int;
(* Optional: [None] when no parsing function is available. *)
mutable parse : (pdfobjmap_key -> pdfobject) option;
mutable pdfobjects : pdfobjmap;
mutable object_stream_ids : (int, int) Hashtbl.t}
(** {2 The PDF document} *)
(** Encryption details which may be stored with a document, in the
[saved_encryption] field of [t]. NOTE(review): the fields are undocumented in
this interface; the notes below are assumptions to be confirmed against the
implementation. *)
type saved_encryption =
(* Presumably the tuple produced by a [get_encryption_values] function defined
elsewhere in the library -- confirm. *)
{from_get_encryption_values :
Pdfcryptprimitives.encryption * string * string * int32 * string * string option * string option;
(* Presumably whether the document metadata is encrypted too -- confirm. *)
encrypt_metadata : bool;
(* Presumably the raw permissions entry -- confirm. *)
perms : string}
(** Parameters carried by the [ToDecrypt] constructor of [toget_crypt], i.e.
attached to stream data which has yet to be read. NOTE(review): the fields are
undocumented in this interface; presumably [obj] and [gen] are the object and
generation numbers of the object concerned, and [key], [keylength] and [r]
describe the encryption key and revision -- confirm against the
implementation. *)
type deferred_encryption =
{crypt_type : Pdfcryptprimitives.encryption;
file_encryption_key : string option;
obj : int;
gen : int;
key : int array;
keylength : int;
r : int}
(** A PDF document. Major and minor version numbers, object number of root, the
objects, and the trailer dictionary as a [Dictionary] [pdfobject]. All fields
are mutable. *)
type t =
{mutable major : int;
mutable minor : int;
mutable root : int;
mutable objects : pdfobjects;
mutable trailerdict : pdfobject;
(* NOTE(review): presumably records whether the file this document was read
from was linearized -- confirm. *)
mutable was_linearized : bool;
(* [None] when no encryption details are stored with the document. *)
mutable saved_encryption : saved_encryption option}
(** [empty ()] builds the empty document (PDF 1.0, no objects, no root, empty
trailer dictionary). Note that this is not a well-formed PDF. *)
val empty : unit -> t
(** {2 Exceptions and errors} *)
(** This exception is raised when some malformity in a PDF is found -- quite a
wide range of circumstances, and may be raised from many functions. The string
is the error text. *)
exception PDFError of string
(** This function, given a [Pdfio.input] and an ancillary string, builds an
error string which includes the source of the [Pdfio.input] (filename, string,
bytes etc.) so we can trace what it was originally built from. *)
val input_pdferror : Pdfio.input -> string -> string
(** {2 Useful utilities} *)
(** Get a stream from disc if it hasn't already been got. The input is a
[Stream] [pdfobject]. Nothing is returned: the stream object is updated in
place. *)
val getstream : pdfobject -> unit
(** Return a float from a [Real], an [Integer] or an [Indirect]. *)
val getnum : t -> pdfobject -> float
(** [lookup_obj doc objnum] looks up an object in a document by its object
number, parsing it if required.
@raise Not_found if the object does not exist. *)
val lookup_obj : t -> int -> pdfobject
(** [lookup_fail errtext doc key dict] looks up a key in a PDF dictionary or the
dictionary of a PDF stream. Follows indirect object links.
@raise PDFError with text [errtext] if the key is not found. *)
val lookup_fail : string -> (t -> string -> pdfobject -> pdfobject)
(** [lookup_exception exn doc key dict] is the same as [lookup_fail], but with a
customised exception: [exn] takes the place of [PDFError errtext]. *)
val lookup_exception : exn -> t -> string -> pdfobject -> pdfobject
(** [lookup_direct doc key dict] looks up the key, resolving indirections at
source and destination, returning an option type. NOTE(review): presumably
[None] means the key was not found -- confirm. *)
val lookup_direct : t -> string -> pdfobject -> pdfobject option
(** [lookup_immediate key dict] looks up the key returning the value, without
following indirects at either source or destination. No document argument is
taken. *)
val lookup_immediate : string -> pdfobject -> pdfobject option
(** [lookup_chain doc start keys] looks up a chain of keys through nested
dictionaries, beginning at [start]. For example
[lookup_chain pdf pdf.Pdf.trailerdict ["/Root"; "/StructTreeRoot";
"/RoleMap"]] *)
val lookup_chain : t -> pdfobject -> string list -> pdfobject option
(** [replace_chain doc chain obj] sets the object at the given chain from the
trailer dictionary to the given object. If the final part of the chain does
not exist, it is created as direct, nested, dictionaries. The document is
updated in place. *)
val replace_chain : t -> string list -> pdfobject -> unit
(** Return the object number of an indirect dictionary object, if it is
indirect. NOTE(review): presumably called as [indirect_number doc key dict], by
analogy with [lookup_direct] -- confirm. *)
val indirect_number : t -> string -> pdfobject -> int option
(** Same as [lookup_direct], but allow a second, alternative key.
NOTE(review): presumably the first key is tried before the alternative --
confirm. *)
val lookup_direct_orelse :
t -> string -> string -> pdfobject -> pdfobject option
(** [remove_dict_entry dict key] removes a dictionary entry, if it exists. The
resulting dictionary is returned. *)
val remove_dict_entry : pdfobject -> string -> pdfobject
(** [replace_dict_entry dict key value] replaces a dictionary entry.
@raise Not_found if the entry is not there. *)
val replace_dict_entry : pdfobject -> string -> pdfobject -> pdfobject
(** [add_dict_entry dict key value] adds a dictionary entry, replacing it if
already there. *)
val add_dict_entry : pdfobject -> string -> pdfobject -> pdfobject
(** [direct doc obj] makes a PDF object direct -- that is, follows any indirect
links. *)
val direct : t -> pdfobject -> pdfobject
(** Return the size of the object map. *)
val objcard : t -> int
(** [removeobj doc objnum] removes the given object from the document. *)
val removeobj : t -> int -> unit
(** Add an object. Returns the object number chosen for it. *)
val addobj : t -> pdfobject -> int
(** [addobj_given_num doc (objnum, obj)] is the same as [addobj], but the object
number is picked by the caller instead of being chosen. *)
val addobj_given_num : t -> (int * pdfobject) -> unit
(** {2 Compound structures} *)
(** Parse a PDF rectangle structure into min x, min y, max x, max y. *)
val parse_rectangle : t -> pdfobject -> float * float * float * float
(** Calling [parse_matrix pdf name dict] parses a PDF matrix found under
key [name] in dictionary [dict] into a [Pdftransform.transform_matrix]. If
there is no matrix, the identity matrix is returned. *)
val parse_matrix : t -> string -> pdfobject -> Pdftransform.transform_matrix
(** Build a matrix [pdfobject] from a [Pdftransform.transform_matrix]. *)
val make_matrix : Pdftransform.transform_matrix -> pdfobject
(** Make a number of PDF documents contain no mutual object numbers. They can
then be merged etc. without clashes. *)
val renumber_pdfs : t list -> t list
(** Given a prefix (e.g. gs) and a dictionary, return a name, starting with the
prefix, which is not already in the dictionary (e.g. /gs0). *)
val unique_key : string -> pdfobject -> string
(** {2 Iteration} *)
(** Iterate over the objects in a document. The iterating function receives both
object number and object from the object map. *)
val objiter : (int -> pdfobject -> unit) -> t -> unit
(** The same, but in object number order. *)
val objiter_inorder : (int -> pdfobject -> unit) -> t -> unit
(** Iterate over the objects in a document. The iterating function receives
object number, generation number and object from the object map. *)
val objiter_gen : (int -> int -> pdfobject -> unit) -> t -> unit
(** Map over all PDF objects in a document. Does not include the trailer
dictionary. Nothing is returned: the document is updated in place. *)
val objselfmap : (pdfobject -> pdfobject) -> t -> unit
(** Iterate over just the stream objects in a document. *)
val iter_stream : (pdfobject -> unit) -> t -> unit
(** {2 Garbage collection} *)
(** Garbage-collect a PDF document. Nothing is returned: the document is updated
in place. *)
val remove_unreferenced : t -> unit
(** {2 Miscellaneous} *)
(** These functions were previously undocumented. They are documented here for
now, and in the future will be categorised more sensibly. *)
(** True if a character is PDF whitespace. *)
val is_whitespace : char -> bool
(** True if a character is not PDF whitespace: the negation of
[is_whitespace]. *)
val is_not_whitespace : char -> bool
(** True if a character is a PDF delimiter. *)
val is_delimiter : char -> bool
(** List, in order, the page reference numbers of a PDF's page tree. *)
val page_reference_numbers : t -> int list
(** List the object numbers in a PDF. NOTE(review): no ordering of the result is
promised by this interface. *)
val objnumbers : t -> int list
(** Use the given function on each element of a PDF dictionary, given as its
list of key-value pairs. A [pdfobject] is returned. *)
val recurse_dict :
(pdfobject -> pdfobject) -> (string * pdfobject) list -> pdfobject
(** Similarly for an [Array], given as its list of elements. The function is
applied to each element. *)
val recurse_array :
(pdfobject -> pdfobject) -> pdfobject list -> pdfobject
(** Calculate the changes required to renumber a PDF's objects 1..n. The result
is a table of object numbers. NOTE(review): presumably keyed by old number with
the new number as value -- confirm. *)
val changes : t -> (int, int) Hashtbl.t
(** Perform the given renumberings on a PDF, returning a document. *)
val renumber : (int, int) Hashtbl.t -> t -> t
(** Renumber an object given a change table. *)
val renumber_object_parsed : t -> (int, int) Hashtbl.t -> pdfobject -> pdfobject
(** Fetch a stream, if necessary, and return its contents (with no processing). *)
val bigarray_of_stream : pdfobject -> Pdfio.bytes
(** Make an objects entry from an optional parser and a list of object map
bindings: each pairs an object number with a reference to the object data and
a generation number, as in [pdfobjmap]. *)
val objects_of_list :
(int -> pdfobject) option -> (int * (objectdata ref * int)) list -> pdfobjects
(** Calling [objects_referenced no_follow_entries no_follow_contains pdf
pdfobject] finds the objects reachable from the given object, returning them as
a list of integers (presumably their object numbers). Dictionary keys in
[no_follow_entries] are not explored. Dictionaries containing entries
(key-value pairs) in [no_follow_contains] are not explored. *)
val objects_referenced : string list -> (string * pdfobject) list -> t -> pdfobject -> int list
(** Generate an ID for a PDF document given its prospective file name (and
using the current date and time). If the file name is blank, the ID is
still likely to be unique, being based on date and time only. If
environment variable CAMLPDF_REPRODUCIBLE_IDS=true is set, the ID will instead
be set to a standard value. NOTE(review): the [unit -> float] argument is
presumably the source of the current time -- confirm. *)
val generate_id : t -> string -> (unit -> float) -> pdfobject
(** Return the document catalog. *)
val catalog_of_pdf : t -> pdfobject
(** [find_indirect key dict] finds the indirect reference given by the value
associated with a key in a dictionary. No document argument is taken.
NOTE(review): presumably [None] when the key is absent or its value is not an
[Indirect] -- confirm. *)
val find_indirect : string -> pdfobject -> int option
(** Calling [nametree_lookup pdf k dict] looks up the name [k] in the name tree
[dict] of the document. *)
val nametree_lookup : t -> pdfobject -> pdfobject -> pdfobject option
(** Return an ordered list of the key-value pairs in a given name tree. *)
val contents_of_nametree : t -> pdfobject -> (pdfobject * pdfobject) list
(** Copy a PDF data structure so that nothing is shared with the original. *)
val deep_copy : t -> t
(** Change the /ID string in a PDF's trailer dictionary. The document is updated
in place. *)
val change_id : t -> string -> unit
(**/**)
(* This is only for the use of Pdfread for when the /Length is incorrect. *)
(* Whether stream data which is still to be read carries deferred decryption
parameters ([ToDecrypt]) or not ([NoChange]). *)
type toget_crypt =
| NoChange
| ToDecrypt of deferred_encryption
(* Accessors for the abstract [toget] type: its length, its input and its
position. *)
val length_of_toget : toget -> int
val input_of_toget : toget -> Pdfio.input
val position_of_toget : toget -> int
(* Build a [toget]. NOTE(review): the two [int] arguments are presumably the
position and then the length, and the default for [?crypt] is not visible
here -- confirm both against the implementation. *)
val toget : ?crypt:toget_crypt -> Pdfio.input -> int -> int -> toget
(* For inter-module recursion within CamlPDF, hence undocumented. *)
(* A reference holding a function from objects to strings. NOTE(review):
presumably filled in by another module at start-up -- confirm. *)
val string_of_pdf : (pdfobject -> string) ref
(* NOTE(review): undocumented. Presumably these apply the given matrix to a
rectangle object and to a quadpoints object respectively -- confirm against
the implementation. *)
val transform_rect : t -> Pdftransform.transform_matrix -> pdfobject -> pdfobject
val transform_quadpoints : t -> Pdftransform.transform_matrix -> pdfobject -> pdfobject