combine_pdfs command line script (#16)
Bonus `combine_pdfs` command line script to merge a bunch of PDFs into one; no change to core functionality.
michelcrypt4d4mus authored Aug 29, 2024
1 parent ca574b7 commit 23aaeb9
Showing 12 changed files with 281 additions and 6 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -1,4 +1,5 @@
# NEXT RELEASE
* Add `combine_pdfs` command line script to merge a bunch of PDFs into one
* Remove unused `Deprecated` dependency

### 1.14.10
9 changes: 8 additions & 1 deletion README.md
@@ -201,6 +201,8 @@ Things like, say, a hidden binary `/F` (PDF instruction meaning "URL") followed
-------------
# PDF Resources
## Included PDF Tools
The Pdfalyzer ships with a command line tool `combine_pdfs` that combines multiple PDFs into a single PDF. Run `combine_pdfs --help` to see the options.
## 3rd Party PDF Tools
### Installing Didier Stevens's PDF Analysis Tools
@@ -223,7 +225,7 @@ There's [a script](scripts/install_t1utils.sh) to help you install the suite if
scripts/install_t1utils.sh
```
## Documentation
## External Documentation
### Official Adobe Documentation
* [Official Adobe PDF 1.7 Specification](https://opensource.adobe.com/dc-acrobat-sdk-docs/standards/pdfstandards/pdf/PDF32000_2008.pdf) - Indispensable map when navigating a PDF forest.
* [Adobe Type 1 Font Format Specification](https://adobe-type-tools.github.io/font-tech-notes/pdfs/T1_SPEC.pdf) - Official spec for Adobe's original font description language and file format. Useful if you have suspicions about malicious fonts. Type1 seems to be the attack vector of choice recently which isn't so surprising when you consider that it's a 30 year old technology and the code that renders these fonts probably hasn't been extensively tested in decades because almost no one uses them anymore outside of people who want to use them as attack vectors.
@@ -268,7 +270,12 @@ These are the naming conventions at play in The Pdfalyzer code base:
| **`indeterminate_node`** | any node whose place in the tree cannot be decided until every node has been seen |
| **`link_node`** | nodes like `/Dest` that just contain a pointer to another node |
### Reference
* [`PyPDF2 2.12.0` documentation](https://pypdf2.readthedocs.io/en/2.12.0/) (the latest `pypdf` is 4.x but `pdfalyze` still runs on `PyPDF2 2.x`, so these older docs are the relevant ones)
# TODO
* Upgrade `PyPDF` to latest and expand `combine_pdfs` compression command line option
* Highlight decodes with a lot of Javascript keywords
* https://github.com/mandiant/flare-floss (https://github.com/mandiant/flare-floss/releases/download/v2.1.0/floss-v2.1.0-linux.zip)
* https://github.com/1Project/Scanr/blob/master/emulator/emulator.py
44 changes: 42 additions & 2 deletions pdfalyzer/__init__.py
@@ -1,10 +1,14 @@
import code
import logging
import sys
from os import environ, getcwd, path
from pathlib import Path

from dotenv import load_dotenv
# TODO: PdfMerger is deprecated in favor of PdfWriter at v3.9.1 (see https://pypdf.readthedocs.io/en/latest/user/merging-pdfs.html#basic-example)
from PyPDF2 import PdfMerger
from PyPDF2.errors import PdfReadError

# Should be first local import before load_dotenv() (or at least I think it needs to come first)
from pdfalyzer.config import PdfalyzerConfig

# load_dotenv() should be called as soon as possible (before parsing local classes) but not for pytest
@@ -16,16 +20,19 @@

from rich.columns import Columns
from rich.panel import Panel
from rich.text import Text
from yaralyzer.helpers.rich_text_helper import prefix_with_plain_text_obj
from yaralyzer.output.file_export import invoke_rich_export
from yaralyzer.output.rich_console import console
from yaralyzer.util.logging import log, log_and_print

from pdfalyzer.helpers.filesystem_helper import file_size_in_mb, set_max_open_files
from pdfalyzer.helpers.rich_text_helper import print_highlighted
from pdfalyzer.output.pdfalyzer_presenter import PdfalyzerPresenter
from pdfalyzer.output.styles.rich_theme import PDFALYZER_THEME_DICT
from pdfalyzer.pdfalyzer import Pdfalyzer
from pdfalyzer.util.argument_parser import ask_to_proceed, output_sections, parse_arguments, parse_combine_pdfs_args
from pdfalyzer.util.pdf_parser_manager import PdfParserManager
from pdfalyzer.util.argument_parser import output_sections, parse_arguments

# For the table shown by running pdfalyzer_show_color_theme
MAX_THEME_COL_SIZE = 35
@@ -82,3 +89,36 @@ def pdfalyzer_show_color_theme() -> None:
    ]

    console.print(Columns(colors, column_first=True, padding=(0,3)))


def combine_pdfs():
    """Utility method to combine multiple PDFs into one. Invocable with 'combine_pdfs PDF1 [PDF2...]'."""
    args = parse_combine_pdfs_args()
    set_max_open_files(args.number_of_pdfs)
    merger = PdfMerger()

    for pdf in args.pdfs:
        try:
            print_highlighted(f" -> Merging '{pdf}'...", style='dim')
            merger.append(pdf)
        except PdfReadError as e:
            print_highlighted(f" -> Failed to merge '{pdf}'! {e}", style='red')
            ask_to_proceed()

    if args.compression_level == 0:
        print_highlighted("\nSkipping content stream compression...")
    else:
        print_highlighted(f"\nCompressing content streams with zlib level {args.compression_level}...")

        for i, page in enumerate(merger.pages):
            # TODO: enable image quality reduction + zlib level once PyPDF is upgraded to 4.x and option is available
            # See https://pypdf.readthedocs.io/en/latest/user/file-size.html#reducing-image-quality
            print_highlighted(f" -> Compressing page {i + 1}...", style='dim')
            page.pagedata.compress_content_streams()  # This is CPU intensive!

    print_highlighted(f"\nWriting '{args.output_file}'...", style='cyan')
    merger.write(args.output_file)
    merger.close()
    txt = Text('').append(" -> Wrote ")
    txt.append(str(file_size_in_mb(args.output_file)), style='cyan').append(" megabytes\n")
    print_highlighted(txt)
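The TODO comments above flag that `PdfMerger` is deprecated in favor of `PdfWriter` and that the zlib level / image quality knobs only arrive with a newer `pypdf`. As a rough, hypothetical sketch of what that migration could look like (not part of this commit; the `pypdf` import, `PdfWriter.append()`, and the `compress_content_streams(level=...)` keyword are assumptions based on the linked pypdf docs and should be verified against whatever version ends up pinned):

```python
# Hypothetical sketch only (not in this commit): the same merge flow using pypdf's PdfWriter,
# which absorbed PdfMerger's append() in pypdf 3.x+. The compress_content_streams(level=...)
# keyword is assumed to exist in the newer pypdf; verify against the pinned version.
from pypdf import PdfWriter


def combine_pdfs_with_pypdf(pdf_paths: list, output_file: str, compression_level: int = 6) -> None:
    writer = PdfWriter()

    for pdf in pdf_paths:
        writer.append(pdf)  # append() merges every page of the given PDF into the writer

    if compression_level > 0:
        for page in writer.pages:
            page.compress_content_streams(level=compression_level)  # zlib level 0-9 (assumed kwarg)

    with open(output_file, "wb") as f:
        writer.write(f)
```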
102 changes: 102 additions & 0 deletions pdfalyzer/helpers/filesystem_helper.py
@@ -0,0 +1,102 @@
"""
Some helpers for stuff with the local filesystem.
"""
import re
from pathlib import Path
from typing import Union

from yaralyzer.output.rich_console import console

from pdfalyzer.helpers.rich_text_helper import print_highlighted

NUMBERED_PAGE_REGEX = re.compile(r'.*_(\d+)\.\w{3,4}$')
DEFAULT_MAX_OPEN_FILES = 256 # macOS default
OPEN_FILES_BUFFER = 30 # we might have some files open already so we need to go beyond DEFAULT_MAX_OPEN_FILES
PDF_EXT = '.pdf'

# TODO: this kind of type alias is not supported until Python 3.12
#type StrOrPath = Union[str, Path]


def with_pdf_extension(file_path: Union[str, Path]) -> str:
    """Append '.pdf' to 'file_path' if it doesn't already end with '.pdf'."""
    return str(file_path) + ('' if is_pdf(file_path) else PDF_EXT)


def is_pdf(file_path: Union[str, Path]) -> bool:
    """Return True if 'file_path' ends with '.pdf'."""
    return str(file_path).endswith(PDF_EXT)


def file_exists(file_path: Union[str, Path]) -> bool:
    """Return True if 'file_path' exists."""
    return Path(file_path).exists()


def do_all_files_exist(file_paths: list[Union[str, Path]]) -> bool:
    """Print an error for each element of 'file_paths' that's not a file. Return True if all 'file_paths' exist."""
    all_files_exist = True

    for file_path in file_paths:
        if not file_exists(file_path):
            console.print(f"File not found: '{file_path}'", style='error')
            all_files_exist = False

    return all_files_exist


def extract_page_number(file_path: Union[str, Path]) -> int|None:
    """Extract the page number from the end of a filename if it exists."""
    match = NUMBERED_PAGE_REGEX.match(str(file_path))
    return int(match.group(1)) if match else None


def file_size_in_mb(file_path: Union[str, Path], decimal_places: int = 2) -> float:
    """Return the size of 'file_path' in MB rounded to 'decimal_places' decimal places."""
    return round(Path(file_path).stat().st_size / 1024.0 / 1024.0, decimal_places)


def set_max_open_files(num_filehandles: int = DEFAULT_MAX_OPEN_FILES) -> tuple[int | None, int | None]:
    """
    Sets the OS level max open files to at least 'num_filehandles'. Current value can be seen with 'ulimit -a'.
    Required when you might be opening more than DEFAULT_MAX_OPEN_FILES file handles simultaneously
    (e.g. when you are merging a lot of small images or PDFs). Equivalent of something like
    'default ulimit -n 1024' on macOS.
    NOTE: Does nothing on Windows (I think).
    NOTE: This mostly came from somewhere on stackoverflow but I lost the link.
    """
    try:
        import resource  # Windows doesn't have this package / doesn't need to bump up the ulimit (??)
    except ImportError:
        resource = None

    if resource is None:
        print_highlighted("No resource module; cannot set max open files on this platform...", style='yellow')
        return (None, None)
    elif num_filehandles <= DEFAULT_MAX_OPEN_FILES:
        # Then the OS max open files value is already sufficient.
        return (DEFAULT_MAX_OPEN_FILES, DEFAULT_MAX_OPEN_FILES)

    # %% (0) what is current ulimit -n setting?
    (soft, hard) = resource.getrlimit(resource.RLIMIT_NOFILE)
    num_filehandles = num_filehandles + OPEN_FILES_BUFFER

    # %% (1) increase limit (soft and even hard) if needed
    if soft < num_filehandles:
        soft = num_filehandles
        hard = max(soft, hard)
        print_highlighted(f"Increasing max open files soft & hard 'ulimit -n {soft} {hard}'...")

        try:
            resource.setrlimit(resource.RLIMIT_NOFILE, (soft, hard))
        except (ValueError, resource.error):
            try:
                hard = soft
                print_highlighted(f"Retrying setting max open files (soft, hard)=({soft}, {hard})", style='yellow')
                resource.setrlimit(resource.RLIMIT_NOFILE, (soft, hard))
            except Exception:
                print_highlighted('Failed to set max open files / ulimit, giving up!', style='error')
                soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)

    return (soft, hard)
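For a concrete sense of what the new filename helpers do, here's an illustrative snippet (not shipped with the commit; the example paths are invented):

```python
# Illustrative only: quick sanity checks of the filesystem helpers above (example paths invented).
from pdfalyzer.helpers.filesystem_helper import extract_page_number, is_pdf, with_pdf_extension

assert extract_page_number("scan_12.pdf") == 12           # trailing '_<digits>' before the extension
assert extract_page_number("scan.pdf") is None            # no numeric suffix -> None
assert extract_page_number("chapter_2_page_7.png") == 7   # only the final number counts
assert is_pdf("report.pdf") and not is_pdf("report.txt")
assert with_pdf_extension("merged") == "merged.pdf"
assert with_pdf_extension("merged.pdf") == "merged.pdf"   # already has the extension, unchanged
```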
12 changes: 12 additions & 0 deletions pdfalyzer/helpers/rich_text_helper.py
@@ -1,14 +1,26 @@
"""
Functions for miscellaneous Rich text/string operations.
"""
from functools import partial
from typing import List

from PyPDF2.generic import PdfObject
from rich.console import Console
from rich.highlighter import RegexHighlighter, JSONHighlighter
from rich.text import Text
from yaralyzer.output.rich_console import console

from pdfalyzer.helpers.pdf_object_helper import pypdf_class_name
from pdfalyzer.output.styles.node_colors import get_label_style, get_class_style_italic

# Usually we use the yaralyzer console but that has no highlighter
pdfalyzer_console = Console(color_system='256')


def print_highlighted(msg: str|Text, **kwargs) -> None:
    """Print 'msg' with Rich highlighting."""
    pdfalyzer_console.print(msg, highlight=True, **kwargs)


def quoted_text(
    _string: str,
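A short usage sketch (not in the diff) of the new `print_highlighted` helper; the point of the dedicated `pdfalyzer_console` is that, unlike the yaralyzer console, it applies Rich's default highlighter to numbers, quoted strings, and similar tokens:

```python
# Illustrative usage of print_highlighted (not part of this commit).
from rich.text import Text
from pdfalyzer.helpers.rich_text_helper import print_highlighted

print_highlighted("Merging 3 PDFs into 'output.pdf'...")         # numbers and quoted strings get highlighted
print_highlighted("Failed to merge 'page_2.pdf'!", style='red')  # extra kwargs pass through to Console.print()
print_highlighted(Text("Wrote ").append("1.25", style='cyan').append(" megabytes"))
```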
79 changes: 77 additions & 2 deletions pdfalyzer/util/argument_parser.py
@@ -1,17 +1,22 @@
import sys
from argparse import ArgumentError, ArgumentParser
from argparse import ArgumentError, ArgumentParser, Namespace
from collections import namedtuple
from functools import partial, update_wrapper
from importlib.metadata import version
from os import getcwd, path
from typing import List

from rich_argparse_plus import RichHelpFormatterPlus
from rich.prompt import Confirm
from rich.text import Text
from yaralyzer.util.argument_parser import export, parser, parse_arguments as parse_yaralyzer_args
from yaralyzer.util.logging import log, log_and_print, log_argparse_result, log_current_config, log_invocation

from pdfalyzer.config import ALL_STREAMS, PdfalyzerConfig
from pdfalyzer.detection.constants.binary_regexes import QUOTE_PATTERNS
from pdfalyzer.helpers.filesystem_helper import (do_all_files_exist, extract_page_number, file_exists, is_pdf,
                                                 with_pdf_extension)
from pdfalyzer.helpers.rich_text_helper import print_highlighted

# NamedTuple to keep our argument selection orderly
OutputSection = namedtuple('OutputSection', ['argument', 'method'])
@@ -107,7 +112,9 @@
parser._action_groups = parser._action_groups[:2] + [parser._action_groups[-1]] + parser._action_groups[2:-1]


# The Parsening Begins
################################
# Main argument parsing begins #
################################
def parse_arguments():
    """Parse command line args. Most settings are communicated to the app by setting env vars"""
    if '--version' in sys.argv:
@@ -175,3 +182,71 @@ def output_sections(args, pdfalyzer) -> List[OutputSection]:
def all_sections_chosen(args):
    """Returns true if all flags are set or no flags are set."""
    return len([s for s in ALL_SECTIONS if vars(args)[s]]) == len(ALL_SECTIONS)


###############################################
# Separate arg parser for combine_pdfs script #
###############################################
combine_pdfs_parser = ArgumentParser(
    description="Combine multiple PDFs into one.",
    epilog="If all PDFs end in a number (e.g. 'xyz_1.pdf', 'xyz_2.pdf', etc.) the files are sorted as if those"
           " were page numbers prior to merging.",
    formatter_class=RichHelpFormatterPlus)

combine_pdfs_parser.add_argument('pdfs',
                                 help='two or more PDFs to combine',
                                 metavar='PDF_PATH',
                                 nargs='+')

combine_pdfs_parser.add_argument('-c', '--compression-level',
                                 help='zlib image compression level (0=none, max=1 until PyPDF is upgraded)',
                                 choices=range(0, 2),
                                 default=1,
                                 type=int)

combine_pdfs_parser.add_argument('-o', '--output-file',
                                 help='path to write the combined PDFs to',
                                 required=True)


def parse_combine_pdfs_args() -> Namespace:
    """Parse command line args for combine_pdfs script."""
    args = combine_pdfs_parser.parse_args()
    args.output_file = with_pdf_extension(args.output_file)
    confirm_overwrite_txt = Text("Overwrite '").append(args.output_file, style='cyan').append("'?")
    args.number_of_pdfs = len(args.pdfs)

    if args.number_of_pdfs < 2:
        exit_with_error("Need at least 2 PDFs to merge.")
    elif not do_all_files_exist(args.pdfs):
        exit_with_error()
    elif file_exists(args.output_file) and not Confirm.ask(confirm_overwrite_txt):
        exit_with_error()

    if all(is_pdf(pdf) for pdf in args.pdfs):
        if all(extract_page_number(pdf) for pdf in args.pdfs):
            print_highlighted("PDFs appear to have page number suffixes so sorting numerically...")
            args.pdfs.sort(key=lambda pdf: extract_page_number(pdf))
        else:
            print_highlighted("PDFs don't seem to end in page numbers so using provided order...", style='yellow')
    else:
        print_highlighted("WARNING: At least one of the PDF args doesn't end in '.pdf'", style='bright_yellow')
        ask_to_proceed()

    print_highlighted(f"\nMerging {args.number_of_pdfs} individual PDFs into '{args.output_file}'...")
    return args


def ask_to_proceed() -> None:
    """Exit if user doesn't confirm they want to proceed."""
    if not Confirm.ask(Text("Proceed anyway?")):
        exit_with_error()


def exit_with_error(error_message: str|None = None) -> None:
    """Print 'error_message' and exit with status code 1."""
    if error_message:
        print_highlighted(error_message, style='bold red')

    print_highlighted('Exiting...', style='red')
    sys.exit(1)
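To make the epilog's sorting rule concrete, here's an illustrative comparison (invented filenames, not shipped) of the numeric page-suffix sort used in `parse_combine_pdfs_args()` versus a plain string sort:

```python
# Illustrative only: numeric page-suffix sort vs. lexicographic sort (filenames invented).
from pdfalyzer.helpers.filesystem_helper import extract_page_number

pdfs = ["report_10.pdf", "report_2.pdf", "report_1.pdf"]

print(sorted(pdfs))                           # ['report_1.pdf', 'report_10.pdf', 'report_2.pdf']
print(sorted(pdfs, key=extract_page_number))  # ['report_1.pdf', 'report_2.pdf', 'report_10.pdf']
```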
1 change: 1 addition & 0 deletions pyproject.toml
@@ -42,6 +42,7 @@ packages = [


[tool.poetry.scripts]
combine_pdfs = 'pdfalyzer:combine_pdfs'
pdfalyze = 'pdfalyzer:pdfalyze'
pdfalyzer_show_color_theme = 'pdfalyzer:pdfalyzer_show_color_theme'

1 change: 0 additions & 1 deletion tests/conftest.py
@@ -8,7 +8,6 @@

from pdfalyzer.pdfalyzer import Pdfalyzer


PROJECT_DIR = path.join(str(importlib.resources.files('pdfalyzer')), pardir)
DOCUMENTATION_DIR = path.join(PROJECT_DIR, 'doc')
SVG_DIR = path.join(DOCUMENTATION_DIR, 'svgs')
Binary file added tests/fixtures/one_page_pdfs/page_1.pdf
Binary file added tests/fixtures/one_page_pdfs/page_2.pdf
Binary file added tests/fixtures/one_page_pdfs/page_3.pdf