From c76be9ea9b99e8307f7ea9b4b83f458d15d4c7e3 Mon Sep 17 00:00:00 2001 From: Gregory Szorc Date: Fri, 16 Oct 2020 21:08:15 -0700 Subject: [PATCH] cli: add a find-resources command This seems like a useful debugging tool to help triage issues with our custom resource scanner. The output could be improved. But this seems like a good enough start. --- docs/history.rst | 4 ++ docs/managing_projects.rst | 51 ++++++++++++++ docs/packaging_pitfalls.rst | 18 +++++ pyoxidizer/src/cli.rs | 74 +++++++++++++++++++ pyoxidizer/src/projectmgmt.rs | 129 ++++++++++++++++++++++++++++++++-- 5 files changed, 272 insertions(+), 4 deletions(-) diff --git a/docs/history.rst b/docs/history.rst index 14ca98fb8..200a6bfb5 100644 --- a/docs/history.rst +++ b/docs/history.rst @@ -48,6 +48,10 @@ New Features * A ``print(*args)`` function is now exposed to Starlark. This function is documented as a Starlark built-in but isn't provided by the Rust Starlark implementation by default. So we've implemented it ourselves. (#292) +* The new ``pyoxidizer find-resources`` command can be used to invoke + PyOxidizer's code for scanning files for resources. This command can be + used to debug and triage bugs related to PyOxidizer's custom code for + finding and handling resources. Bug Fixes ^^^^^^^^^ diff --git a/docs/managing_projects.rst b/docs/managing_projects.rst index 3b9107460..4bcdd3016 100644 --- a/docs/managing_projects.rst +++ b/docs/managing_projects.rst @@ -189,3 +189,54 @@ are governed by the X11, and GPL-3.0 licenses:: can be wrong. They do not constitute a legal promise. Paranoid individuals may want to double check the license annotations by verifying with source code distributions, for example. + +.. 
_cli_find_resources: + +Debugging Resource Scanning and Identification with ``find-resources`` +====================================================================== + +The ``pyoxidizer find-resources`` command can be used to scan for +resources in a given source and then print information on what's found. + +PyOxidizer's packaging functionality scans directories and files and +classifies them as Python resources which can be operated on. See +:ref:`packaging_resource_types`. PyOxidizer's run-time importer/loader +(:ref:`oxidized_importer`) works by reading a pre-built index of known +resources. This all works in contrast to how Python typically works, +which is to put a bunch of files in directories and let the built-in +importer/loader figure it out by dynamically probing for various files. + +Because PyOxidizer has introduced structure where it doesn't exist +in Python and because there are many subtle nuances with how files +are classified, there can be bugs in PyOxidizer's resource scanning +code. + +The ``pyoxidizer find-resources`` command exists to facilitate +debugging PyOxidizer's resource scanning code. + +Simply give the command a path to a directory or Python wheel archive +and it will tell you what it discovers. 
e.g.:: + + $ pyoxidizer find-resources dist/oxidized_importer-0.1-cp38-cp38-manylinux1_x86_64.whl + parsing dist/oxidized_importer-0.1-cp38-cp38-manylinux1_x86_64.whl as a wheel archive + PythonExtensionModule { name: oxidized_importer } + PythonPackageDistributionResource { package: oxidized-importer, version: 0.1, name: LICENSE } + PythonPackageDistributionResource { package: oxidized-importer, version: 0.1, name: WHEEL } + PythonPackageDistributionResource { package: oxidized-importer, version: 0.1, name: top_level.txt } + PythonPackageDistributionResource { package: oxidized-importer, version: 0.1, name: METADATA } + PythonPackageDistributionResource { package: oxidized-importer, version: 0.1, name: RECORD } + +Or give it the path to a ``site-packages`` directory:: + + $ pyoxidizer find-resources ~/.pyenv/versions/3.8.6/lib/python3.8/site-packages + ... + +This command needs to use a Python distribution so it knows what file +extensions correspond to Python extensions, etc. By default, it will +download one of the +:ref:`built-in distributions <packaging_python_distributions>` that is +compatible with the current machine and use that. You can specify a +``--distributions-dir`` to use to cache downloaded distributions:: + + $ pyoxidizer find-resources --distributions-dir distributions /usr/lib/python3.8 + ... diff --git a/docs/packaging_pitfalls.rst b/docs/packaging_pitfalls.rst index b5da07037..3eb61dff8 100644 --- a/docs/packaging_pitfalls.rst +++ b/docs/packaging_pitfalls.rst @@ -41,3 +41,21 @@ like so:: if getattr(sys, 'oxidized', False): print('running in PyOxidizer!') + +.. _pitfall_incorrect_resource_identification: + +Incorrect Resource Identification +================================= + +PyOxidizer has custom code for scanning for and indexing files as specific +Python resource types. This code is somewhat complex and nuanced and there +are known bugs that will cause PyOxidizer to fail to identify or classify a +file appropriately. 
+ +To help debug problems with this code, the ``pyoxidizer find-resources`` +command can be employed. See :ref:`cli_find_resources` for more. + +.. important:: + + Please `file a bug <https://github.com/indygreg/PyOxidizer/issues/new>`_ + to report problems! diff --git a/pyoxidizer/src/cli.rs b/pyoxidizer/src/cli.rs index 2b1b90221..c09f63d55 100644 --- a/pyoxidizer/src/cli.rs +++ b/pyoxidizer/src/cli.rs @@ -72,6 +72,25 @@ This command executes the functionality to derive various artifacts and emits special lines that tell the Rust build system how to consume them. "; +const RESOURCES_SCAN_ABOUT: &str = "\ +Scan a directory or file for Python resources. + +This command invokes the logic used by various PyOxidizer functionality +walking a directory tree or parsing a file and categorizing seen files. + +The directory walking functionality is used by +`oxidized_importer.find_resources_in_path()` and Starlark methods like +`PythonExecutable.pip_install()` and +`PythonExecutable.read_package_root()`. + +The file parsing logic is used for parsing the contents of wheels. + +This command can be used to debug failures with PyOxidizer's code +for converting files/directories into strongly typed objects. This +conversion is critical for properly packaging Python applications and +bugs can result in incorrect install layouts, missing resources, etc. 
+"; + pub fn run_cli() -> Result<()> { let env = crate::environment::resolve_environment()?; @@ -121,6 +140,34 @@ pub fn run_cli() -> Result<()> { .help("The config file target to resolve"), ), ) + .subcommand( + SubCommand::with_name("find-resources") + .about("Find resources in a file or directory") + .long_about(RESOURCES_SCAN_ABOUT) + .setting(AppSettings::ArgRequiredElseHelp) + .arg( + Arg::with_name("distributions_dir") + .long("distributions-dir") + .takes_value(true) + .value_name("PATH") + .help("Directory to extract downloaded Python distributions into"), + ) + .arg( + Arg::with_name("scan_distribution") + .long("--scan-distribution") + .help("Scan the Python distribution instead of a path"), + ) + .arg( + Arg::with_name("target_triple") + .long("target-triple") + .takes_value(true) + .default_value(env!("HOST")) + .help("Target triple of Python distribution to use"), + ) + .arg(Arg::with_name("path").value_name("PATH").help( + "Filesystem path to scan for resources. Must be a directory or Python wheel", + )), + ) .subcommand( SubCommand::with_name("init-config-file") .setting(AppSettings::ArgRequiredElseHelp) @@ -313,6 +360,33 @@ pub fn run_cli() -> Result<()> { ) } + ("find-resources", Some(args)) => { + let path = if let Some(value) = args.value_of("path") { + Some(Path::new(value)) + } else { + None + }; + let distributions_dir = if let Some(value) = args.value_of("distributions_dir") { + Some(Path::new(value)) + } else { + None + }; + let scan_distribution = args.is_present("scan_distribution"); + let target_triple = args.value_of("target_triple").unwrap(); + + if path.is_none() && !scan_distribution { + Err(anyhow!("must specify a path or --scan-distribution")) + } else { + projectmgmt::find_resources( + &logger_context.logger, + path, + distributions_dir, + scan_distribution, + target_triple, + ) + } + } + ("init-config-file", Some(args)) => { let code = args.value_of("python-code"); let pip_install = if args.is_present("pip-install") { diff --git 
a/pyoxidizer/src/projectmgmt.rs b/pyoxidizer/src/projectmgmt.rs index c79943458..603781d9f 100644 --- a/pyoxidizer/src/projectmgmt.rs +++ b/pyoxidizer/src/projectmgmt.rs @@ -7,12 +7,22 @@ use { crate::project_building::find_pyoxidizer_config_file_env, crate::project_layout::{initialize_project, write_new_pyoxidizer_config_file}, - crate::py_packaging::standalone_distribution::StandaloneDistribution, + crate::py_packaging::{ + distribution::{default_distribution_location, resolve_distribution, DistributionFlavor}, + standalone_distribution::StandaloneDistribution, + }, crate::starlark::eval::{eval_starlark_config_file, EvalResult}, anyhow::{anyhow, Result}, - std::fs::create_dir_all, - std::io::{Cursor, Read}, - std::path::Path, + python_packaging::{ + filesystem_scanning::find_python_resources, + resource::{DataLocation, PythonResource}, + wheel::WheelArchive, + }, + std::{ + fs::create_dir_all, + io::{Cursor, Read}, + path::Path, + }, }; /// Attempt to resolve the default Rust target for a build. @@ -146,6 +156,117 @@ pub fn run( res.context.run_target(target) } +/// Find resources given a source path. +pub fn find_resources( + logger: &slog::Logger, + path: Option<&Path>, + distributions_dir: Option<&Path>, + scan_distribution: bool, + target_triple: &str, +) -> Result<()> { + let distribution_location = + default_distribution_location(&DistributionFlavor::Standalone, target_triple, None)?; + + let mut temp_dir = None; + + let extract_path = if let Some(path) = distributions_dir { + path + } else { + temp_dir.replace(tempdir::TempDir::new("python-distribution")?); + temp_dir.as_ref().unwrap().path() + }; + + let dist = resolve_distribution(logger, &distribution_location, extract_path)?; + + if scan_distribution { + println!("scanning distribution"); + + for ext in dist.iter_extension_modules() { + print_resource(&PythonResource::from(ext)); + } + for source in dist.source_modules()? 
{ + print_resource(&PythonResource::from(source)); + } + for data in dist.resource_datas()? { + print_resource(&PythonResource::from(data)); + } + } else if let Some(path) = path { + if path.is_dir() { + println!("scanning directory {}", path.display()); + for resource in + find_python_resources(path, dist.cache_tag(), &dist.python_module_suffixes()?) + { + print_resource(&resource?); + } + } else if path.is_file() { + if let Some(extension) = path.extension() { + if extension.to_string_lossy() == "whl" { + println!("parsing {} as a wheel archive", path.display()); + let wheel = WheelArchive::from_path(path)?; + + for resource in + wheel.python_resources(dist.cache_tag(), &dist.python_module_suffixes()?)? + { + print_resource(&resource) + } + + return Ok(()); + } + } + + println!("do not know how to find resources in {}", path.display()); + } else { + println!("do not know how to find resources in {}", path.display()); + } + } else { + println!("do not know what to scan"); + } + + Ok(()) +} + +fn print_resource(r: &PythonResource) { + match r { + PythonResource::ModuleSource(m) => println!( + "PythonModuleSource {{ name: {}, is_package: {}, is_stdlib: {}, is_test: {} }}", + m.name, m.is_package, m.is_stdlib, m.is_test + ), + PythonResource::ModuleBytecode(m) => println!( + "PythonModuleBytecode {{ name: {}, is_package: {}, is_stdlib: {}, is_test: {}, bytecode_level: {} }}", + m.name, m.is_package, m.is_stdlib, m.is_test, i32::from(m.optimize_level) + ), + PythonResource::ModuleBytecodeRequest(_) => println!( + "PythonModuleBytecodeRequest {{ you should never see this }}" + ), + PythonResource::PackageResource(r) => println!( + "PythonPackageResource {{ package: {}, name: {}, is_stdlib: {}, is_test: {} }}", r.leaf_package, r.relative_name, r.is_stdlib, r.is_test + ), + PythonResource::PackageDistributionResource(r) => println!( + "PythonPackageDistributionResource {{ package: {}, version: {}, name: {} }}", r.package, r.version, r.name + ), + 
PythonResource::ExtensionModule(em) => { + println!( + "PythonExtensionModule {{" + ); + println!(" name: {}", em.name); + println!(" is_builtin: {}", em.builtin_default); + println!(" has_shared_library: {}", em.shared_library.is_some()); + println!(" has_object_files: {}", !em.object_file_data.is_empty()); + println!(" link_libraries: {:?}", em.link_libraries); + println!("}}"); + }, + PythonResource::EggFile(e) => println!( + "PythonEggFile {{ path: {} }}", match &e.data { + DataLocation::Path(p) => p.display().to_string(), + DataLocation::Memory(_) => "memory".to_string(), + } + ), + PythonResource::PathExtension(_pe) => println!( + "PythonPathExtension", + ), + } +} + /// Initialize a PyOxidizer configuration file in a given directory. pub fn init_config_file( project_dir: &Path,