From 2db69225b2033f6f73baa224ddbcb7035b56e3f2 Mon Sep 17 00:00:00 2001 From: Jose Javier Merchante Date: Fri, 1 Mar 2024 17:30:35 +0200 Subject: [PATCH] [git] Add recovery functionality to fetch method This commit implements the recovery functionality in the fetch method of the Git backend, enabling the recovery of Perceval execution from a specific commit. Now, users can specify a recovery commit to start fetching commits from that point, ensuring robustness against failures. Signed-off-by: Jose Javier Merchante --- perceval/backends/core/git.py | 121 ++++++++++++- .../include-recovery-mode-for-git-backend.yml | 12 ++ tests/test_git.py | 166 ++++++++++++++++++ 3 files changed, 297 insertions(+), 2 deletions(-) create mode 100644 releases/unreleased/include-recovery-mode-for-git-backend.yml diff --git a/perceval/backends/core/git.py b/perceval/backends/core/git.py index 284406e0c..624a38240 100644 --- a/perceval/backends/core/git.py +++ b/perceval/backends/core/git.py @@ -83,7 +83,7 @@ def __init__(self, uri, gitpath, tag=None, archive=None, ssl_verify=True): self.gitpath = gitpath def fetch(self, category=CATEGORY_COMMIT, from_date=DEFAULT_DATETIME, to_date=DEFAULT_LAST_DATETIME, - branches=None, latest_items=False, no_update=False): + branches=None, latest_items=False, recovery_commit=None, no_update=False): """Fetch commits. 
The method retrieves from a Git repository or a log file @@ -118,6 +118,7 @@ def fetch(self, category=CATEGORY_COMMIT, from_date=DEFAULT_DATETIME, to_date=DE :param branches: names of branches to fetch from (default: None) :param latest_items: sync with the repository to fetch only the newest commits + :param recovery_commit: recover from this commit without updating the repo :param no_update: if enabled, don't update the repo with the latest changes :returns: a generator of commits @@ -132,6 +133,7 @@ def fetch(self, category=CATEGORY_COMMIT, from_date=DEFAULT_DATETIME, to_date=DE 'to_date': to_date, 'branches': branches, 'latest_items': latest_items, + 'recovery_commit': recovery_commit, 'no_update': no_update } items = super().fetch(category, **kwargs) @@ -151,11 +153,14 @@ def fetch_items(self, category, **kwargs): branches = kwargs['branches'] latest_items = kwargs['latest_items'] no_update = kwargs['no_update'] + recovery_commit = kwargs['recovery_commit'] ncommits = 0 try: - if os.path.isfile(self.gitpath): + if recovery_commit: + commits = self._recovery(recovery_commit, from_date, to_date, branches) + elif os.path.isfile(self.gitpath): commits = self._fetch_from_log() else: commits = self._fetch_from_repo(from_date, to_date, branches, @@ -186,6 +191,20 @@ def has_resuming(cls): """ return True + def metadata(self, item, filter_classified=False): + """Git metadata. + + This method takes items, overriding `metadata` decorator, + to add extra information related to Git.
+ + :param item: an item fetched by a backend + :param filter_classified: sets if classified fields were filtered + """ + item = super().metadata(item, filter_classified=filter_classified) + item['offset'] = item['data']['commit'] + + return item + @staticmethod def metadata_id(item): """Extracts the identifier from a Git item.""" @@ -321,6 +340,55 @@ def _fetch_newest_commits_from_repo(self, repo): gitshow = repo.show(hashes) return self.parse_git_log_from_iter(gitshow) + def __fetch_from_packs(self, repo, packs, from_commit): + """Retrieve commits from packfiles starting with the pack containing from_commit""" + + hashes = repo.get_commits_from_packs(packs, from_commit) + gitshow = repo.show(hashes) + commits = self.parse_git_log_from_iter(gitshow) + + return commits + + def _recovery(self, from_commit, from_date, to_date, branches): + """Recover Perceval execution from a specific commit + + If the path is a Git log file, resume the execution using the + Git file. + + When the path is a directory, there are two cases to consider: + + If the repository contains only loose objects without packfiles, + or a single packfile without loose objects (this occurs when the + repository is large enough or to reduce storage space), fetch the + commits as if it were the first execution. This involves using the + '_fetch_from_repo' method, which retrieves commits using the + log. + + If the repository contains more than one packfile, or has loose + objects and one packfile, we can deduce that the packfile is from the + last execution. In this case, we will fetch the commits using the + '__fetch_from_packs' method.
+ """ + if os.path.isfile(self.gitpath): + commits = self._fetch_from_log() + else: + repo = self._create_git_repository() + packs = repo.packs_by_date() + if not packs or (len(packs) == 1 and not repo.has_loose_objects()): + commits = self._fetch_from_repo(from_date=from_date, to_date=to_date, + branches=branches, no_update=True) + else: + commits = self.__fetch_from_packs(repo, packs, from_commit) + + # Only commits after from_commit + found = False + for commit in commits: + if not found and commit['commit'] == from_commit: + found = True + + if found: + yield commit + def _create_git_repository(self): if not os.path.exists(self.gitpath): repo = GitRepository.clone(self.uri, self.gitpath, self.ssl_verify) @@ -380,6 +448,8 @@ def setup_cmd_parser(cls): exgroup_fetch.add_argument('--latest-items', dest='latest_items', action='store_true', help="Fetch latest commits added to the repository") + exgroup_fetch.add_argument('--recovery', dest='recovery_commit', + help="Recover the last execution from a commit") exgroup_fetch.add_argument('--no-update', dest='no_update', action='store_true', help="Fetch all commits without updating the repository") @@ -1112,6 +1182,53 @@ def show(self, commits=None, encoding='utf-8'): logger.debug("Git show fetched from %s repository (%s)", self.uri, self.dirpath) + def get_commits_from_packs(self, packs, from_commit): + """Get commits from a specific one using fetched packfiles""" + + hashes = [] + found = False + + for pack in packs: + commits = self._read_commits_from_pack(pack) + for commit in commits: + if not found and from_commit == commit: + found = True + + if found: + hashes.append(commit) + + return hashes + + def packs_by_date(self): + """Get all packs ordered by date""" + + packs_dir = os.path.join(self.dirpath, 'objects/pack/') + + files = os.listdir(packs_dir) + # Sort by date, from older to newer + files.sort(key=lambda x: os.path.getmtime(os.path.join(packs_dir, x))) + packs = [f.split('.')[0].split('-')[1] + for f in 
files + if f.endswith('.idx')] + + return packs + + def has_loose_objects(self): + """Check if the repository has loose objects""" + + cmd_count_objects = ['git', 'count-objects', '-v'] + + outs = self._exec(cmd_count_objects, cwd=self.dirpath, env=self.gitenv) + outs = outs.decode('utf-8', errors='surrogateescape').rstrip() + + for line in outs.split('\n'): + if line.startswith('count:'): + count = int(line.split(':')[1].strip()) + return count > 0 + else: + msg = "Unexpected output format from 'git count-objects -v'" + raise RepositoryError(cause=msg) + def _fetch_pack(self): """Fetch changes and store them in a pack.""" diff --git a/releases/unreleased/include-recovery-mode-for-git-backend.yml b/releases/unreleased/include-recovery-mode-for-git-backend.yml new file mode 100644 index 000000000..5c8975f2f --- /dev/null +++ b/releases/unreleased/include-recovery-mode-for-git-backend.yml @@ -0,0 +1,12 @@ +--- +title: Include recovery mode for Git backend +category: added +author: null +issue: null +notes: > + Include a new option in Git that allows continuing + to fetch commits from the previous execution using the last + commit. The option is `--recovery <commit>`. + + The last commit can be obtained from the offset in the summary + object of the last execution or the last item.
diff --git a/tests/test_git.py b/tests/test_git.py index b783ac29f..264d56fee 100644 --- a/tests/test_git.py +++ b/tests/test_git.py @@ -129,6 +129,19 @@ def test_has_resuming(self): self.assertEqual(Git.has_resuming(), True) + def test_metadata(self): + """Test that the offset metadata is the commit from the item""" + + new_path = os.path.join(self.tmp_path, 'newgit') + + git = Git(self.git_path, new_path) + commits = [commit for commit in git.fetch()] + + for commit in commits: + self.assertEqual(commit['offset'], commit['data']['commit']) + + shutil.rmtree(new_path) + def test_fetch_submodules(self): """Test whether repositories with submodules are correctly fetched""" @@ -706,6 +719,111 @@ def test_fetch_from_file(self): self.assertEqual(commit['category'], 'commit') self.assertEqual(commit['tag'], 'http://example.com.git') + def test_fetch_recovery_from_repo(self): + """Test whether recovery from a commits in first execution works""" + + origin_path = os.path.join(self.tmp_repo_path, 'gittest') + editable_path = os.path.join(self.tmp_path, 'editgit') + new_path = os.path.join(self.tmp_path, 'newgit') + + shutil.copytree(origin_path, editable_path) + + git = Git(editable_path, new_path) + commits = [commit for commit in git.fetch()] + + # Count the number of commits before adding some new + expected = [('bc57a9209f096a130dcc5ba7089a8663f758a703', 1344965413.0), + ('87783129c3f00d2c81a3a8e585eb86a47e39891a', 1344965535.0), + ('7debcf8a2f57f86663809c58b5c07a398be7674c', 1344965607.0), + ('c0d66f92a95e31c77be08dc9d0f11a16715d1885', 1344965702.0), + ('c6ba8f7a1058db3e6b4bc6f1090e932b107605fb', 1344966351.0), + ('589bb080f059834829a2a5955bebfd7c2baa110a', 1344967441.0), + ('ce8e0b86a1e9877f42fe9453ede418519115f367', 1392185269.0), + ('51a3b654f252210572297f47597b31527c475fb8', 1392185366.0), + ('456a68ee1407a77f3e804a30dff245bb6c6b872f', 1392185439.0)] + + self.assertEqual(len(commits), len(expected)) + + for x in range(len(commits)): + expected_uuid = 
uuid(editable_path, expected[x][0]) + commit = commits[x] + self.assertEqual(commit['uuid'], expected_uuid) + self.assertEqual(commit['data']['commit'], expected[x][0]) + + # Check we can recover from a commit + from_commit = 'c6ba8f7a1058db3e6b4bc6f1090e932b107605fb' + commits_recovery = [commit for commit in git.fetch(recovery_commit=from_commit)] + + expected_recovery = expected[4:] + + self.assertEqual(len(commits_recovery), len(expected_recovery)) + + for x in range(len(commits_recovery)): + expected_uuid = uuid(editable_path, expected_recovery[x][0]) + commit = commits_recovery[x] + self.assertEqual(commit['uuid'], expected_uuid) + self.assertEqual(commit['data']['commit'], expected_recovery[x][0]) + + # Cleanup + shutil.rmtree(editable_path) + shutil.rmtree(new_path) + + def test_fetch_recovery_from_packs(self): + """Test whether recovery from a commits in a repo with packs works""" + + origin_path = os.path.join(self.tmp_repo_path, 'gittest') + editable_path = os.path.join(self.tmp_path, 'editgit') + new_path = os.path.join(self.tmp_path, 'newgit') + new_file = os.path.join(editable_path, 'newfile') + + shutil.copytree(origin_path, editable_path) + + git = Git(editable_path, new_path) + _ = [commit for commit in git.fetch()] + + # Create some new commits + cmd = ['git', 'checkout', '-b', 'mybranch'] + subprocess.check_output(cmd, stderr=subprocess.STDOUT, + cwd=editable_path, env={'LANG': 'C'}) + + with open(new_file, 'w') as f: + f.write("Testing sync method") + + cmd = ['git', 'add', new_file] + subprocess.check_output(cmd, stderr=subprocess.STDOUT, + cwd=editable_path, env={'LANG': 'C'}) + + cmd = ['git', '-c', 'user.name="mock"', + '-c', 'user.email="mock@example.com"', + 'commit', '-m', 'Testing sync'] + subprocess.check_output(cmd, stderr=subprocess.STDOUT, + cwd=editable_path, env={'LANG': 'C'}) + + cmd = ['git', 'rm', new_file] + subprocess.check_output(cmd, stderr=subprocess.STDOUT, + cwd=editable_path, env={'LANG': 'C'}) + + cmd = ['git', '-c', 
'user.name="mock"', + '-c', 'user.email="mock@example.com"', + 'commit', '-m', 'Removing testing file for sync'] + subprocess.check_output(cmd, stderr=subprocess.STDOUT, + cwd=editable_path, env={'LANG': 'C'}) + + # Two new commits should have been fetched + commits = [commit for commit in git.fetch(latest_items=True)] + self.assertEqual(len(commits), 2) + + # Check if we can recover from the last packfile + from_commit = commits[0]['data']['commit'] + commits_recovery = [commit for commit in git.fetch(recovery_commit=from_commit)] + self.assertEqual(len(commits_recovery), 2) + self.assertEqual(commits_recovery[0]['uuid'], commits[0]['uuid']) + self.assertEqual(commits_recovery[1]['uuid'], commits[1]['uuid']) + + # Cleanup + shutil.rmtree(editable_path) + shutil.rmtree(new_path) + def test_git_parser(self): """Test if the static method parses a git log file""" @@ -873,6 +991,18 @@ def test_setup_cmd_parser(self): self.assertEqual(parsed_args.uri, 'http://example.com/') self.assertFalse(parsed_args.ssl_verify) + args = ['http://example.com/', + '--git-path', '/tmp/gitpath', + '--recovery', 'foocommit'] + + parsed_args = parser.parse(*args) + self.assertEqual(parsed_args.git_path, '/tmp/gitpath') + self.assertEqual(parsed_args.uri, 'http://example.com/') + self.assertEqual(parsed_args.recovery_commit, 'foocommit') + self.assertFalse(parsed_args.no_update) + self.assertFalse(parsed_args.latest_items) + self.assertTrue(parsed_args.ssl_verify) + def test_mutual_exclusive_update(self): """Test whether an exception is thrown when no-update and latest-items flags are set""" @@ -885,6 +1015,18 @@ def test_mutual_exclusive_update(self): with self.assertRaises(SystemExit): _ = parser.parse(*args) + def test_mutual_exclusive_recovery(self): + """Test whether an exception is thrown when recovery and latest-items flags are set""" + + parser = GitCommand.setup_cmd_parser() + args = ['http://example.com/', + '--git-path', '/tmp/gitpath', + '--recovery', 'foocommit', + 
'--latest-items'] + + with self.assertRaises(SystemExit): + _ = parser.parse(*args) + class TestGitParser(TestCaseGit): """Git parser tests""" @@ -1826,6 +1968,30 @@ def test_git_show(self): shutil.rmtree(new_path) + def test_has_loose_objects(self): + """Test if the repository has loose objects""" + + new_path = os.path.join(self.tmp_path, 'newgit') + + repo = GitRepository.clone(self.git_path, new_path) + + # Create a loose object in the repository + process = subprocess.Popen(['git', 'hash-object', '-w', '--stdin'], + stdin=subprocess.PIPE, + cwd=new_path, + env={'LANG': 'C'}) + process.communicate(input=b"Data test") + + self.assertTrue(repo.has_loose_objects()) + + # Group loose objects in a packfile and remove unreachable objects + subprocess.run(['git', 'gc'], cwd=new_path, check=True) + subprocess.run(['git', 'prune'], cwd=new_path, check=True) + + self.assertFalse(repo.has_loose_objects()) + + shutil.rmtree(new_path) + def test_show_alternates(self): """Test show command with alternate objects"""