Skip to content

Commit

Permalink
feat(general): support UTF-16 and other encodings in multiple frameworks (#5308)
Browse files Browse the repository at this point in the history

* support UTF-16 and other encodings in multiple frameworks

* linting
  • Loading branch information
gruebel committed Jul 10, 2023
1 parent 40e3626 commit 456225b
Show file tree
Hide file tree
Showing 15 changed files with 66 additions and 55 deletions.
3 changes: 2 additions & 1 deletion checkov/ansible/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from checkov.ansible.graph_builder.graph_components.resource_types import ResourceType
from checkov.common.parsers.yaml.parser import parse
from checkov.common.util.consts import START_LINE, END_LINE
from checkov.common.util.file_utils import read_file_with_any_encoding
from checkov.common.util.suppression import collect_suppressions_for_context

TASK_NAME_PATTERN = re.compile(r"^\s*-\s+name:\s+", re.MULTILINE)
Expand Down Expand Up @@ -76,7 +77,7 @@ def get_relevant_file_content(file_path: str | Path) -> str | None:
if not str(file_path).endswith((".yaml", ".yml")):
return None

content = Path(file_path).read_text()
content = read_file_with_any_encoding(file_path=file_path)
match_task_name = re.search(TASK_NAME_PATTERN, content)
if match_task_name:
# there are more files, which belong to an ansible playbook,
Expand Down
4 changes: 2 additions & 2 deletions checkov/argo_workflows/runner.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
from __future__ import annotations

import re
from pathlib import Path
from typing import TYPE_CHECKING, Any

from checkov.common.images.image_referencer import ImageReferencer, Image
from checkov.common.output.report import CheckType
from checkov.common.util.file_utils import read_file_with_any_encoding
from checkov.yaml_doc.runner import Runner as YamlRunner

# Import of the checks registry for a specific resource type
Expand Down Expand Up @@ -44,7 +44,7 @@ def _get_workflow_file_content(self, file_path: str) -> str | None:
if not file_path.endswith((".yaml", ".yml")):
return None

content = Path(file_path).read_text()
content = read_file_with_any_encoding(file_path=file_path)
match_api = re.search(API_VERSION_PATTERN, content)
if match_api:
match_kind = re.search(KIND_PATTERN, content)
Expand Down
11 changes: 2 additions & 9 deletions checkov/arm/parser/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,12 @@
from pathlib import Path
from typing import Any

from charset_normalizer import from_path
from yaml.scanner import ScannerError
from yaml import YAMLError

from checkov.common.parsers.json import parse as json_parse
from checkov.common.parsers.yaml import loader

from checkov.common.util.file_utils import read_file_with_any_encoding

LOGGER = logging.getLogger(__name__)

Expand Down Expand Up @@ -57,13 +56,7 @@ def load(filename: Path | str) -> tuple[dict[str, Any], list[tuple[int, str]]]:
Load the given JSON/YAML file
"""

file_path = filename if isinstance(filename, Path) else Path(filename)

try:
content = file_path.read_text()
except UnicodeDecodeError:
logging.debug(f"Encoding for file {file_path} is not UTF-8, trying to detect it")
content = str(from_path(file_path).best())
content = read_file_with_any_encoding(file_path=filename)

if not all(key in content for key in ("$schema", "contentVersion")):
return {}, []
Expand Down
4 changes: 3 additions & 1 deletion checkov/bicep/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@

from pycep import BicepParser

from checkov.common.util.file_utils import read_file_with_any_encoding

if TYPE_CHECKING:
from pycep.typing import BicepJson

Expand All @@ -18,7 +20,7 @@ def __init__(self) -> None:

def parse(self, file_path: Path) -> tuple[BicepJson, list[tuple[int, str]]] | tuple[None, None]:
try:
content = file_path.read_text()
content = read_file_with_any_encoding(file_path=file_path)
template = self.bicep_parser.parse(text=content)
except Exception:
logging.debug(f"[bicep] Couldn't parse {file_path}", exc_info=True)
Expand Down
7 changes: 2 additions & 5 deletions checkov/cloudformation/parser/cfn_yaml.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
from checkov.common.parsers.json.decoder import SimpleDecoder
from checkov.common.parsers.node import StrNode, DictNode, ListNode
from checkov.common.util.consts import MAX_IAC_FILE_SIZE
from checkov.common.util.file_utils import read_file_with_any_encoding

try:
from yaml.cyaml import CParser as Parser # type:ignore[attr-defined]
Expand Down Expand Up @@ -253,11 +254,7 @@ def load(filename: str | Path, content_type: ContentType) -> tuple[dict[str, Any
LOGGER.error(f"Encoding for file {file_path} could not be detected or read. Please try encoding the file as UTF-8.")
raise e
else:
try:
content = file_path.read_text()
except UnicodeDecodeError:
LOGGER.info(f"Encoding for file {file_path} is not UTF-8, trying to detect it")
content = str(from_path(file_path).best())
content = read_file_with_any_encoding(file_path=file_path)

if content_type == ContentType.CFN and "Resources" not in content:
logging.debug(f'File {file_path} is expected to be a CFN template but has no Resources attribute')
Expand Down
12 changes: 3 additions & 9 deletions checkov/common/parsers/json/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,9 @@
from pathlib import Path
from typing import Any

from charset_normalizer import from_path

from checkov.common.parsers.json.decoder import Decoder
from checkov.common.parsers.json.errors import DecodeError
from checkov.common.util.file_utils import read_file_with_any_encoding

LOGGER = logging.getLogger(__name__)

Expand All @@ -24,13 +23,8 @@ def load(
Load the given JSON file
"""

try:
if not content:
file_path = filename if isinstance(filename, Path) else Path(filename)
content = file_path.read_text()
except UnicodeDecodeError:
LOGGER.info(f"Encoding for file {filename} is not UTF-8, trying to detect it")
content = str(from_path(filename).best()) # type:ignore[arg-type] # somehow str is not recognized as PathLike
if not content:
content = read_file_with_any_encoding(file_path=filename)

file_lines = [(idx + 1, line) for idx, line in enumerate(content.splitlines(keepends=True))]

Expand Down
5 changes: 3 additions & 2 deletions checkov/common/parsers/yaml/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
import yaml
from yaml.loader import SafeLoader

from checkov.common.util.file_utils import read_file_with_any_encoding

if TYPE_CHECKING:
from yaml import MappingNode

Expand All @@ -31,8 +33,7 @@ def load(filename: str | Path, content: str | None = None) -> tuple[list[dict[st
"""

if not content:
file_path = filename if isinstance(filename, Path) else Path(filename)
content = file_path.read_text()
content = read_file_with_any_encoding(file_path=filename)

file_lines = [(idx + 1, line) for idx, line in enumerate(content.splitlines(keepends=True))]

Expand Down
22 changes: 21 additions & 1 deletion checkov/common/util/file_utils.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,18 @@
from __future__ import annotations

import os.path
import tarfile
import base64
import gzip
import io
import logging

from pathlib import Path
from zipfile import ZipFile

from charset_normalizer import from_path

logger = logging.getLogger(__name__)


def convert_to_unix_path(path: str) -> str:
return path.replace('\\', '/')
Expand Down Expand Up @@ -89,3 +95,17 @@ def get_file_size_safe(file_path: str) -> int:
extra={"file_path": file_path}
)
return -1


def read_file_with_any_encoding(file_path: str | Path) -> str:
    """Read a text file, falling back to encoding detection if decoding fails.

    The file is first read via ``Path.read_text()`` with its default encoding.
    If that raises ``UnicodeDecodeError``, the encoding is detected with
    ``charset_normalizer.from_path`` and the best match is decoded instead.

    Args:
        file_path: Location of the file, given as a string or ``Path``.

    Returns:
        The decoded content of the file.

    Raises:
        UnicodeDecodeError: If the default decoding fails and no encoding
            could be detected for the file either.
    """

    path = file_path if isinstance(file_path, Path) else Path(file_path)

    try:
        return path.read_text()
    except UnicodeDecodeError:
        logger.info(f"Encoding for file {path} is not UTF-8, trying to detect it")
        best_match = from_path(path).best()
        if best_match is None:
            # No encoding matched; str(None) would otherwise hand the literal
            # text "None" to the parsers as if it were the file content.
            logger.error(
                f"Encoding for file {path} could not be detected or read. Please try encoding the file as UTF-8."
            )
            raise
        return str(best_match)
3 changes: 2 additions & 1 deletion checkov/github_actions/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@

from checkov.common.parsers.yaml.loader import SafeLineLoaderGhaSchema
from checkov.common.parsers.yaml.parser import parse
from checkov.common.util.file_utils import read_file_with_any_encoding
from checkov.common.util.type_forcers import force_dict
from checkov.github_actions.graph_builder.graph_components.resource_types import ResourceType
from checkov.github_actions.schemas import gha_schema, gha_workflow
Expand Down Expand Up @@ -41,7 +42,7 @@ def parse_file(

if is_workflow_file(file_path):
if not file_content:
file_content = file_path.read_text()
file_content = read_file_with_any_encoding(file_path=file_path)

entity_schema = parse(filename=str(f), file_content=file_content)

Expand Down
11 changes: 3 additions & 8 deletions checkov/kubernetes/parser/k8_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,10 @@
from typing import Tuple, Dict, Any, List, TYPE_CHECKING

import yaml
from charset_normalizer import from_path
from yaml.loader import SafeLoader

from checkov.common.util.file_utils import read_file_with_any_encoding

if TYPE_CHECKING:
from yaml import MappingNode

Expand Down Expand Up @@ -38,13 +39,7 @@ def load(filename: Path) -> Tuple[List[Dict[str, Any]], List[Tuple[int, str]]]:
Load the given JSON file
"""

file_path = filename if isinstance(filename, Path) else Path(filename)

try:
content = file_path.read_text()
except UnicodeDecodeError:
logger.debug(f"Encoding for file {file_path} is not UTF-8, trying to detect it")
content = str(from_path(file_path).best())
content = read_file_with_any_encoding(file_path=filename)

if not all(key in content for key in ("apiVersion", "kind")):
return [{}], []
Expand Down
11 changes: 3 additions & 8 deletions checkov/kubernetes/parser/k8_yaml.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,10 @@
from typing import List, Dict, Any, Tuple, TYPE_CHECKING

import yaml
from charset_normalizer import from_path
from yaml.loader import SafeLoader

from checkov.common.util.file_utils import read_file_with_any_encoding

if TYPE_CHECKING:
from yaml import MappingNode

Expand All @@ -34,13 +35,7 @@ def load(filename: Path) -> Tuple[List[Dict[str, Any]], List[Tuple[int, str]]]:
Load the given YAML file
"""

file_path = filename if isinstance(filename, Path) else Path(filename)

try:
content = file_path.read_text()
except UnicodeDecodeError:
logging.debug(f"Encoding for file {file_path} is not UTF-8, trying to detect it")
content = str(from_path(file_path).best())
content = read_file_with_any_encoding(file_path=filename)

if not all(key in content for key in ("apiVersion", "kind")):
return [{}], []
Expand Down
4 changes: 2 additions & 2 deletions checkov/openapi/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

from checkov.common.checks.base_check_registry import BaseCheckRegistry
from checkov.common.bridgecrew.check_type import CheckType
from checkov.common.util.file_utils import read_file_with_any_encoding
from checkov.yaml_doc.runner import Runner as YamlRunner
from checkov.json_doc.runner import Runner as JsonRunner
from pathlib import Path
Expand Down Expand Up @@ -96,8 +97,7 @@ def get_resource(self, file_path: str, key: str, supported_entities: Iterable[st
return ",".join(supported_entities)

def load_file(self, filename: str | Path) -> str:
file_path = filename if isinstance(filename, Path) else Path(filename)
content = file_path.read_text()
content = read_file_with_any_encoding(file_path=filename)
return content

def pre_validate_file(self, file_content: str) -> bool:
Expand Down
8 changes: 2 additions & 6 deletions checkov/terraform_json/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,13 @@
from pathlib import Path
from typing import Any

from charset_normalizer import from_path
from yaml.scanner import ScannerError
from yaml import YAMLError

from checkov.common.parsers.json import parse as json_parse
from checkov.common.parsers.yaml import loader
from checkov.common.util.consts import LINE_FIELD_NAMES
from checkov.common.util.file_utils import read_file_with_any_encoding
from checkov.terraform.graph_builder.graph_components.block_types import BlockType

COMMENT_FIELD_NAME = "//"
Expand Down Expand Up @@ -61,11 +61,7 @@ def parse(file_path: Path) -> tuple[dict[str, Any], list[tuple[int, str]]] | tup
def loads(file_path: Path) -> tuple[dict[str, Any], list[tuple[int, str]]]:
"""Loads the given JSON file with line numbers"""

try:
content = file_path.read_text()
except UnicodeDecodeError:
logging.debug(f"Encoding for file {file_path} is not UTF-8, trying to detect it")
content = str(from_path(file_path).best())
content = read_file_with_any_encoding(file_path=file_path)

if not all(key in content for key in ("resource", "provider")):
return {}, []
Expand Down
Binary file added tests/ansible/examples/k8s_utf16.yaml
Binary file not shown.
16 changes: 16 additions & 0 deletions tests/ansible/test_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -392,3 +392,19 @@ def test_get_resource_without_name(graph_connector):

# then
assert new_key == "tasks.amazon.aws.ec2_instance.unknown"


def test_runner_process_utf16_file():
    """Run the runner on the ``k8s_utf16.yaml`` example and expect an all-zero summary."""
    # given
    utf16_example = EXAMPLES_DIR / "k8s_utf16.yaml"

    # when
    summary = Runner().run(root_folder="", files=[str(utf16_example)]).get_summary()

    # then
    # nothing should be reported for this file, and it must not fail to parse
    for counter in ("passed", "failed", "skipped", "parsing_errors"):
        assert summary[counter] == 0

0 comments on commit 456225b

Please sign in to comment.