combine_pdfs command line script (#16)
Bonus `combine_pdfs` command line script to merge a bunch of PDFs into one; no change to core functionality.
michelcrypt4d4mus authored Aug 29, 2024
1 parent ca574b7 commit 23aaeb9
Showing 12 changed files with 281 additions and 6 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -1,4 +1,5 @@
# NEXT RELEASE
* Add `combine_pdfs` command line script to merge a bunch of PDFs into one
* Remove unused `Deprecated` dependency

### 1.14.10
9 changes: 8 additions & 1 deletion README.md
@@ -201,6 +201,8 @@ Things like, say, a hidden binary `/F` (PDF instruction meaning "URL") followed
-------------
# PDF Resources
## Included PDF Tools
The Pdfalyzer ships with a command line tool `combine_pdfs` that combines multiple PDFs into a single PDF. Run `combine_pdfs --help` to see the options.
## 3rd Party PDF Tools
### Installing Didier Stevens's PDF Analysis Tools
@@ -223,7 +225,7 @@ There's [a script](scripts/install_t1utils.sh) to help you install the suite if
scripts/install_t1utils.sh
```
## Documentation
## External Documentation
### Official Adobe Documentation
* [Official Adobe PDF 1.7 Specification](https://opensource.adobe.com/dc-acrobat-sdk-docs/standards/pdfstandards/pdf/PDF32000_2008.pdf) - Indispensable map when navigating a PDF forest.
* [Adobe Type 1 Font Format Specification](https://adobe-type-tools.github.io/font-tech-notes/pdfs/T1_SPEC.pdf) - Official spec for Adobe's original font description language and file format. Useful if you have suspicions about malicious fonts. Type1 seems to be the attack vector of choice recently which isn't so surprising when you consider that it's a 30 year old technology and the code that renders these fonts probably hasn't been extensively tested in decades because almost no one uses them anymore outside of people who want to use them as attack vectors.
@@ -268,7 +270,12 @@ These are the naming conventions at play in The Pdfalyzer code base:
| **`indeterminate_node`** | any node whose place in the tree cannot be decided until every node has been seen |
| **`link_node`** | nodes like `/Dest` that just contain a pointer to another node |
### Reference
* [`PyPDF2 2.12.0` documentation](https://pypdf2.readthedocs.io/en/2.12.0/) (the latest `pypdf` is 4.x but `pdfalyze` still runs on `PyPDF2 2.x`, so these older docs are the relevant ones)
# TODO
* Upgrade `PyPDF` to latest and expand `combine_pdfs` compression command line option
* Highlight decodes with a lot of Javascript keywords
* https://github.com/mandiant/flare-floss (https://github.com/mandiant/flare-floss/releases/download/v2.1.0/floss-v2.1.0-linux.zip)
* https://github.com/1Project/Scanr/blob/master/emulator/emulator.py
44 changes: 42 additions & 2 deletions pdfalyzer/__init__.py
@@ -1,10 +1,14 @@
import code
import logging
import sys
from os import environ, getcwd, path
from pathlib import Path

from dotenv import load_dotenv
# TODO: PdfMerger is deprecated in favor of PdfWriter at v3.9.1 (see https://pypdf.readthedocs.io/en/latest/user/merging-pdfs.html#basic-example)
from PyPDF2 import PdfMerger
from PyPDF2.errors import PdfReadError

# Should be first local import before load_dotenv() (or at least I think it needs to come first)
from pdfalyzer.config import PdfalyzerConfig

# load_dotenv() should be called as soon as possible (before parsing local classes) but not for pytest
@@ -16,16 +20,19 @@

from rich.columns import Columns
from rich.panel import Panel
from rich.text import Text
from yaralyzer.helpers.rich_text_helper import prefix_with_plain_text_obj
from yaralyzer.output.file_export import invoke_rich_export
from yaralyzer.output.rich_console import console
from yaralyzer.util.logging import log, log_and_print

from pdfalyzer.helpers.filesystem_helper import file_size_in_mb, set_max_open_files
from pdfalyzer.helpers.rich_text_helper import print_highlighted
from pdfalyzer.output.pdfalyzer_presenter import PdfalyzerPresenter
from pdfalyzer.output.styles.rich_theme import PDFALYZER_THEME_DICT
from pdfalyzer.pdfalyzer import Pdfalyzer
from pdfalyzer.util.argument_parser import ask_to_proceed, output_sections, parse_arguments, parse_combine_pdfs_args
from pdfalyzer.util.pdf_parser_manager import PdfParserManager
from pdfalyzer.util.argument_parser import output_sections, parse_arguments

# For the table shown by running pdfalyzer_show_color_theme
MAX_THEME_COL_SIZE = 35
@@ -82,3 +89,36 @@ def pdfalyzer_show_color_theme() -> None:
    ]

    console.print(Columns(colors, column_first=True, padding=(0,3)))


def combine_pdfs():
    """Utility method to combine multiple PDFs into one. Invocable with 'combine_pdfs PDF1 [PDF2...]'."""
    args = parse_combine_pdfs_args()
    set_max_open_files(args.number_of_pdfs)
    merger = PdfMerger()

    for pdf in args.pdfs:
        try:
            print_highlighted(f" -> Merging '{pdf}'...", style='dim')
            merger.append(pdf)
        except PdfReadError as e:
            print_highlighted(f" -> Failed to merge '{pdf}'! {e}", style='red')
            ask_to_proceed()

    if args.compression_level == 0:
        print_highlighted("\nSkipping content stream compression...")
    else:
        print_highlighted(f"\nCompressing content streams with zlib level {args.compression_level}...")

        for i, page in enumerate(merger.pages):
            # TODO: enable image quality reduction + zlib level once PyPDF is upgraded to 4.x and option is available
            # See https://pypdf.readthedocs.io/en/latest/user/file-size.html#reducing-image-quality
            print_highlighted(f" -> Compressing page {i + 1}...", style='dim')
            page.pagedata.compress_content_streams()  # This is CPU intensive!

    print_highlighted(f"\nWriting '{args.output_file}'...", style='cyan')
    merger.write(args.output_file)
    merger.close()
    txt = Text('').append(" -> Wrote ")
    txt.append(str(file_size_in_mb(args.output_file)), style='cyan').append(" megabytes\n")
    print_highlighted(txt)
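The TODO comments above flag that `PdfMerger` is deprecated in favor of `PdfWriter` and that the zlib level / image quality knobs only arrive with a newer `pypdf`. As a rough, hypothetical sketch of what that migration could look like (not part of this commit; the `pypdf` import, `PdfWriter.append()`, and the `compress_content_streams(level=...)` keyword are assumptions based on the linked pypdf docs and should be verified against whatever version ends up pinned):

```python
# Hypothetical sketch only (not in this commit): the same merge flow using pypdf's PdfWriter,
# which absorbed PdfMerger's append() in pypdf 3.x+. The compress_content_streams(level=...)
# keyword is assumed to exist in the newer pypdf; verify against the pinned version.
from pypdf import PdfWriter


def combine_pdfs_with_pypdf(pdf_paths: list, output_file: str, compression_level: int = 6) -> None:
    writer = PdfWriter()

    for pdf in pdf_paths:
        writer.append(pdf)  # append() merges every page of the given PDF into the writer

    if compression_level > 0:
        for page in writer.pages:
            page.compress_content_streams(level=compression_level)  # zlib level 0-9 (assumed kwarg)

    with open(output_file, "wb") as f:
        writer.write(f)
```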
102 changes: 102 additions & 0 deletions pdfalyzer/helpers/filesystem_helper.py
@@ -0,0 +1,102 @@
"""
Some helpers for stuff with the local filesystem.
"""
import re
from pathlib import Path
from typing import Union

from yaralyzer.output.rich_console import console

from pdfalyzer.helpers.rich_text_helper import print_highlighted

NUMBERED_PAGE_REGEX = re.compile(r'.*_(\d+)\.\w{3,4}$')
DEFAULT_MAX_OPEN_FILES = 256 # macOS default
OPEN_FILES_BUFFER = 30 # we might have some files open already so we need to go beyond DEFAULT_MAX_OPEN_FILES
PDF_EXT = '.pdf'

# TODO: this kind of type alias is not supported until Python 3.12
#type StrOrPath = Union[str, Path]


def with_pdf_extension(file_path: Union[str, Path]) -> str:
    """Append '.pdf' to 'file_path' if it doesn't already end with '.pdf'."""
    return str(file_path) + ('' if is_pdf(file_path) else PDF_EXT)


def is_pdf(file_path: Union[str, Path]) -> bool:
    """Return True if 'file_path' ends with '.pdf'."""
    return str(file_path).endswith(PDF_EXT)


def file_exists(file_path: Union[str, Path]) -> bool:
    """Return True if 'file_path' exists."""
    return Path(file_path).exists()


def do_all_files_exist(file_paths: list[Union[str, Path]]) -> bool:
    """Print an error for each element of 'file_paths' that's not a file. Return True if all 'file_paths' exist."""
    all_files_exist = True

    for file_path in file_paths:
        if not file_exists(file_path):
            console.print(f"File not found: '{file_path}'", style='error')
            all_files_exist = False

    return all_files_exist


def extract_page_number(file_path: Union[str, Path]) -> int|None:
    """Extract the page number from the end of a filename if it exists."""
    match = NUMBERED_PAGE_REGEX.match(str(file_path))
    return int(match.group(1)) if match else None


def file_size_in_mb(file_path: Union[str, Path], decimal_places: int = 2) -> float:
    """Return the size of 'file_path' in MB rounded to 'decimal_places' decimal places."""
    return round(Path(file_path).stat().st_size / 1024.0 / 1024.0, decimal_places)


def set_max_open_files(num_filehandles: int = DEFAULT_MAX_OPEN_FILES) -> tuple[int | None, int | None]:
    """
    Sets the OS level max open files to at least 'num_filehandles'. Current value can be seen with 'ulimit -a'.
    Required when you might be opening more than DEFAULT_MAX_OPEN_FILES file handles simultaneously
    (e.g. when you are merging a lot of small images or PDFs). Equivalent of something like
    'default ulimit -n 1024' on macOS.
    NOTE: Does nothing on Windows (I think).
    NOTE: This mostly came from somewhere on stackoverflow but I lost the link.
    """
    try:
        import resource  # Windows doesn't have this package / doesn't need to bump up the ulimit (??)
    except ImportError:
        resource = None

    if resource is None:
        print_highlighted("No resource module; cannot set max open files on this platform...", style='yellow')
        return (None, None)
    elif num_filehandles <= DEFAULT_MAX_OPEN_FILES:
        # Then the OS max open files value is already sufficient.
        return (DEFAULT_MAX_OPEN_FILES, DEFAULT_MAX_OPEN_FILES)

    # %% (0) what is current ulimit -n setting?
    (soft, hard) = resource.getrlimit(resource.RLIMIT_NOFILE)
    num_filehandles = num_filehandles + OPEN_FILES_BUFFER

    # %% (1) increase limit (soft and even hard) if needed
    if soft < num_filehandles:
        soft = num_filehandles
        hard = max(soft, hard)
        print_highlighted(f"Increasing max open files soft & hard 'ulimit -n {soft} {hard}'...")

        try:
            resource.setrlimit(resource.RLIMIT_NOFILE, (soft, hard))
        except (ValueError, resource.error):
            try:
                hard = soft
                print_highlighted(f"Retrying setting max open files (soft, hard)=({soft}, {hard})", style='yellow')
                resource.setrlimit(resource.RLIMIT_NOFILE, (soft, hard))
            except Exception:
                print_highlighted('Failed to set max open files / ulimit, giving up!', style='error')
                soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)

    return (soft, hard)
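For a concrete sense of what the new filename helpers do, here's an illustrative snippet (not shipped with the commit; the example paths are invented):

```python
# Illustrative only: quick sanity checks of the filesystem helpers above (example paths invented).
from pdfalyzer.helpers.filesystem_helper import extract_page_number, is_pdf, with_pdf_extension

assert extract_page_number("scan_12.pdf") == 12           # trailing '_<digits>' before the extension
assert extract_page_number("scan.pdf") is None            # no numeric suffix -> None
assert extract_page_number("chapter_2_page_7.png") == 7   # only the final number counts
assert is_pdf("report.pdf") and not is_pdf("report.txt")
assert with_pdf_extension("merged") == "merged.pdf"
assert with_pdf_extension("merged.pdf") == "merged.pdf"   # already has the extension, unchanged
```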
12 changes: 12 additions & 0 deletions pdfalyzer/helpers/rich_text_helper.py
@@ -1,14 +1,26 @@
"""
Functions for miscellaneous Rich text/string operations.
"""
from functools import partial
from typing import List

from PyPDF2.generic import PdfObject
from rich.console import Console
from rich.highlighter import RegexHighlighter, JSONHighlighter
from rich.text import Text
from yaralyzer.output.rich_console import console

from pdfalyzer.helpers.pdf_object_helper import pypdf_class_name
from pdfalyzer.output.styles.node_colors import get_label_style, get_class_style_italic

# Usually we use the yaralyzer console but that has no highlighter
pdfalyzer_console = Console(color_system='256')


def print_highlighted(msg: str|Text, **kwargs) -> None:
    """Print 'msg' with Rich highlighting."""
    pdfalyzer_console.print(msg, highlight=True, **kwargs)


def quoted_text(
    _string: str,
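A short usage sketch (not in the diff) of the new `print_highlighted` helper; the point of the dedicated `pdfalyzer_console` is that, unlike the yaralyzer console, it applies Rich's default highlighter to numbers, quoted strings, and similar tokens:

```python
# Illustrative usage of print_highlighted (not part of this commit).
from rich.text import Text
from pdfalyzer.helpers.rich_text_helper import print_highlighted

print_highlighted("Merging 3 PDFs into 'output.pdf'...")         # numbers and quoted strings get highlighted
print_highlighted("Failed to merge 'page_2.pdf'!", style='red')  # extra kwargs pass through to Console.print()
print_highlighted(Text("Wrote ").append("1.25", style='cyan').append(" megabytes"))
```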
79 changes: 77 additions & 2 deletions pdfalyzer/util/argument_parser.py
@@ -1,17 +1,22 @@
import sys
from argparse import ArgumentError, ArgumentParser
from argparse import ArgumentError, ArgumentParser, Namespace
from collections import namedtuple
from functools import partial, update_wrapper
from importlib.metadata import version
from os import getcwd, path
from typing import List

from rich_argparse_plus import RichHelpFormatterPlus
from rich.prompt import Confirm
from rich.text import Text
from yaralyzer.util.argument_parser import export, parser, parse_arguments as parse_yaralyzer_args
from yaralyzer.util.logging import log, log_and_print, log_argparse_result, log_current_config, log_invocation

from pdfalyzer.config import ALL_STREAMS, PdfalyzerConfig
from pdfalyzer.detection.constants.binary_regexes import QUOTE_PATTERNS
from pdfalyzer.helpers.filesystem_helper import (do_all_files_exist, extract_page_number, file_exists, is_pdf,
                                                 with_pdf_extension)
from pdfalyzer.helpers.rich_text_helper import print_highlighted

# NamedTuple to keep our argument selection orderly
OutputSection = namedtuple('OutputSection', ['argument', 'method'])
@@ -107,7 +112,9 @@
parser._action_groups = parser._action_groups[:2] + [parser._action_groups[-1]] + parser._action_groups[2:-1]


# The Parsening Begins
################################
# Main argument parsing begins #
################################
def parse_arguments():
    """Parse command line args. Most settings are communicated to the app by setting env vars"""
    if '--version' in sys.argv:
@@ -175,3 +182,71 @@ def output_sections(args, pdfalyzer) -> List[OutputSection]:
def all_sections_chosen(args):
    """Returns true if all flags are set or no flags are set."""
    return len([s for s in ALL_SECTIONS if vars(args)[s]]) == len(ALL_SECTIONS)


###############################################
# Separate arg parser for combine_pdfs script #
###############################################
combine_pdfs_parser = ArgumentParser(
    description="Combine multiple PDFs into one.",
    epilog="If all PDFs end in a number (e.g. 'xyz_1.pdf', 'xyz_2.pdf', etc.) the files are sorted as if those"
           " were page numbers prior to merging.",
    formatter_class=RichHelpFormatterPlus)

combine_pdfs_parser.add_argument('pdfs',
                                 help='two or more PDFs to combine',
                                 metavar='PDF_PATH',
                                 nargs='+')

combine_pdfs_parser.add_argument('-c', '--compression-level',
                                 help='zlib image compression level (0=none, max=1 until PyPDF is upgraded)',
                                 choices=range(0, 2),
                                 default=1,
                                 type=int)

combine_pdfs_parser.add_argument('-o', '--output-file',
                                 help='path to write the combined PDFs to',
                                 required=True)


def parse_combine_pdfs_args() -> Namespace:
    """Parse command line args for combine_pdfs script."""
    args = combine_pdfs_parser.parse_args()
    args.output_file = with_pdf_extension(args.output_file)
    confirm_overwrite_txt = Text("Overwrite '").append(args.output_file, style='cyan').append("'?")
    args.number_of_pdfs = len(args.pdfs)

    if args.number_of_pdfs < 2:
        exit_with_error("Need at least 2 PDFs to merge.")
    elif not do_all_files_exist(args.pdfs):
        exit_with_error()
    elif file_exists(args.output_file) and not Confirm.ask(confirm_overwrite_txt):
        exit_with_error()

    if all(is_pdf(pdf) for pdf in args.pdfs):
        if all(extract_page_number(pdf) for pdf in args.pdfs):
            print_highlighted("PDFs appear to have page number suffixes so sorting numerically...")
            args.pdfs.sort(key=lambda pdf: extract_page_number(pdf))
        else:
            print_highlighted("PDFs don't seem to end in page numbers so using provided order...", style='yellow')
    else:
        print_highlighted("WARNING: At least one of the PDF args doesn't end in '.pdf'", style='bright_yellow')
        ask_to_proceed()

    print_highlighted(f"\nMerging {args.number_of_pdfs} individual PDFs into '{args.output_file}'...")
    return args


def ask_to_proceed() -> None:
    """Exit if user doesn't confirm they want to proceed."""
    if not Confirm.ask(Text("Proceed anyway?")):
        exit_with_error()


def exit_with_error(error_message: str|None = None) -> None:
    """Print 'error_message' and exit with status code 1."""
    if error_message:
        print_highlighted(error_message, style='bold red')

    print_highlighted('Exiting...', style='red')
    sys.exit(1)
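To make the epilog's sorting rule concrete, here's an illustrative comparison (invented filenames, not shipped) of the numeric page-suffix sort used in `parse_combine_pdfs_args()` versus a plain string sort:

```python
# Illustrative only: numeric page-suffix sort vs. lexicographic sort (filenames invented).
from pdfalyzer.helpers.filesystem_helper import extract_page_number

pdfs = ["report_10.pdf", "report_2.pdf", "report_1.pdf"]

print(sorted(pdfs))                           # ['report_1.pdf', 'report_10.pdf', 'report_2.pdf']
print(sorted(pdfs, key=extract_page_number))  # ['report_1.pdf', 'report_2.pdf', 'report_10.pdf']
```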
1 change: 1 addition & 0 deletions pyproject.toml
@@ -42,6 +42,7 @@ packages = [


[tool.poetry.scripts]
combine_pdfs = 'pdfalyzer:combine_pdfs'
pdfalyze = 'pdfalyzer:pdfalyze'
pdfalyzer_show_color_theme = 'pdfalyzer:pdfalyzer_show_color_theme'

1 change: 0 additions & 1 deletion tests/conftest.py
@@ -8,7 +8,6 @@

from pdfalyzer.pdfalyzer import Pdfalyzer


PROJECT_DIR = path.join(str(importlib.resources.files('pdfalyzer')), pardir)
DOCUMENTATION_DIR = path.join(PROJECT_DIR, 'doc')
SVG_DIR = path.join(DOCUMENTATION_DIR, 'svgs')
Binary file added tests/fixtures/one_page_pdfs/page_1.pdf
Binary file added tests/fixtures/one_page_pdfs/page_2.pdf
Binary file added tests/fixtures/one_page_pdfs/page_3.pdf