Skip to content

Commit

Permalink
Merge branch 'recovery-git' of 'https://github.com/jjmerchante/grimoirelab-perceval'
Browse files Browse the repository at this point in the history

Merges #838
Closes #838
  • Loading branch information
sduenas authored Mar 12, 2024
2 parents 319ae97 + 2db6922 commit 5ab5dcf
Show file tree
Hide file tree
Showing 3 changed files with 297 additions and 2 deletions.
121 changes: 119 additions & 2 deletions perceval/backends/core/git.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ def __init__(self, uri, gitpath, tag=None, archive=None, ssl_verify=True):
self.gitpath = gitpath

def fetch(self, category=CATEGORY_COMMIT, from_date=DEFAULT_DATETIME, to_date=DEFAULT_LAST_DATETIME,
branches=None, latest_items=False, no_update=False):
branches=None, latest_items=False, recovery_commit=None, no_update=False):
"""Fetch commits.
The method retrieves from a Git repository or a log file
Expand Down Expand Up @@ -118,6 +118,7 @@ def fetch(self, category=CATEGORY_COMMIT, from_date=DEFAULT_DATETIME, to_date=DE
:param branches: names of branches to fetch from (default: None)
:param latest_items: sync with the repository to fetch only the
newest commits
:param recovery_commit: recover from this commit without updating the repo
:param no_update: if enabled, don't update the repo with the latest changes
:returns: a generator of commits
Expand All @@ -132,6 +133,7 @@ def fetch(self, category=CATEGORY_COMMIT, from_date=DEFAULT_DATETIME, to_date=DE
'to_date': to_date,
'branches': branches,
'latest_items': latest_items,
'recovery_commit': recovery_commit,
'no_update': no_update
}
items = super().fetch(category, **kwargs)
Expand All @@ -151,11 +153,14 @@ def fetch_items(self, category, **kwargs):
branches = kwargs['branches']
latest_items = kwargs['latest_items']
no_update = kwargs['no_update']
recovery_commit = kwargs['recovery_commit']

ncommits = 0

try:
if os.path.isfile(self.gitpath):
if recovery_commit:
commits = self._recovery(recovery_commit, from_date, to_date, branches)
elif os.path.isfile(self.gitpath):
commits = self._fetch_from_log()
else:
commits = self._fetch_from_repo(from_date, to_date, branches,
Expand Down Expand Up @@ -186,6 +191,20 @@ def has_resuming(cls):
"""
return True

def metadata(self, item, filter_classified=False):
    """Git metadata.

    Extends the parent `metadata` method to attach extra
    Git-specific information to each item.

    :param item: an item fetched by a backend
    :param filter_classified: sets if classified fields were filtered
    """
    enriched = super().metadata(item, filter_classified=filter_classified)
    # Expose the commit hash as the item offset, so a later
    # execution can be recovered starting from it.
    enriched['offset'] = enriched['data']['commit']

    return enriched

@staticmethod
def metadata_id(item):
"""Extracts the identifier from a Git item."""
Expand Down Expand Up @@ -321,6 +340,55 @@ def _fetch_newest_commits_from_repo(self, repo):
gitshow = repo.show(hashes)
return self.parse_git_log_from_iter(gitshow)

def __fetch_from_packs(self, repo, packs, from_commit):
    """Retrieve commits from packfiles starting with the pack containing from_commit"""

    # Collect the hashes stored in the packfiles, ask Git to show
    # each of those commits, and parse the resulting log output.
    selected_hashes = repo.get_commits_from_packs(packs, from_commit)
    log_output = repo.show(selected_hashes)

    return self.parse_git_log_from_iter(log_output)

def _recovery(self, from_commit, from_date, to_date, branches):
    """Recover Perceval execution from a specific commit.

    If the path is a Git log file, resume the execution using the
    Git file.

    When the path is a directory, there are two cases to consider:

    If the repository contains only loose objects without packfiles,
    or a single packfile without loose objects (this occurs when the
    repository is large enough or to reduce storage space), fetch the
    commits as if it were the first execution. This involves using the
    '_fetch_from_repo' method, which retrieves commits using the log.

    If the repository contains more than one packfile, or has loose
    objects and one packfile, we can deduce that the packfile is from
    the last execution. In this case, fetch the commits using the
    '__fetch_from_packs' method.

    :param from_commit: hash of the commit to resume from; commits seen
        before it (in traversal order) are discarded
    :param from_date: obtain commits newer than this date
    :param to_date: obtain commits older than this date
    :param branches: names of branches to fetch commits from
    :returns: a generator of commits starting at `from_commit`
        (inclusive); yields nothing when `from_commit` is never found
    """
    if os.path.isfile(self.gitpath):
        commits = self._fetch_from_log()
    else:
        repo = self._create_git_repository()
        packs = repo.packs_by_date()
        if not packs or (len(packs) == 1 and not repo.has_loose_objects()):
            commits = self._fetch_from_repo(from_date=from_date, to_date=to_date,
                                            branches=branches, no_update=True)
        else:
            commits = self.__fetch_from_packs(repo, packs, from_commit)

    # Only commits after from_commit
    found = False
    for commit in commits:
        if not found and commit['commit'] == from_commit:
            found = True

        if found:
            yield commit

def _create_git_repository(self):
if not os.path.exists(self.gitpath):
repo = GitRepository.clone(self.uri, self.gitpath, self.ssl_verify)
Expand Down Expand Up @@ -380,6 +448,8 @@ def setup_cmd_parser(cls):
exgroup_fetch.add_argument('--latest-items', dest='latest_items',
action='store_true',
help="Fetch latest commits added to the repository")
exgroup_fetch.add_argument('--recovery', dest='recovery_commit',
help="Recover the last execution from a commit")
exgroup_fetch.add_argument('--no-update', dest='no_update',
action='store_true',
help="Fetch all commits without updating the repository")
Expand Down Expand Up @@ -1112,6 +1182,53 @@ def show(self, commits=None, encoding='utf-8'):
logger.debug("Git show fetched from %s repository (%s)",
self.uri, self.dirpath)

def get_commits_from_packs(self, packs, from_commit):
    """Get commits from a specific one using fetched packfiles"""

    collected = []
    recording = False

    for pack in packs:
        for sha in self._read_commits_from_pack(pack):
            # Start collecting once `from_commit` shows up; the
            # matching commit itself is included in the result.
            if sha == from_commit:
                recording = True
            if recording:
                collected.append(sha)

    return collected

def packs_by_date(self):
    """Get all packs ordered by date"""

    pack_dir = os.path.join(self.dirpath, 'objects/pack/')

    # Oldest packfiles first, ranked by modification time
    entries = sorted(os.listdir(pack_dir),
                     key=lambda entry: os.path.getmtime(os.path.join(pack_dir, entry)))

    # Keep only index files and extract the pack hash from
    # names of the form 'pack-<hash>.idx'
    return [entry.split('.')[0].split('-')[1]
            for entry in entries
            if entry.endswith('.idx')]

def has_loose_objects(self):
    """Check if the repository has loose objects"""

    output = self._exec(['git', 'count-objects', '-v'],
                        cwd=self.dirpath, env=self.gitenv)
    decoded = output.decode('utf-8', errors='surrogateescape').rstrip()

    for entry in decoded.split('\n'):
        if entry.startswith('count:'):
            # 'count' reports the number of loose objects in the repo
            return int(entry.split(':')[1].strip()) > 0

    msg = "Unexpected output format from 'git count-objects -v'"
    raise RepositoryError(cause=msg)

def _fetch_pack(self):
"""Fetch changes and store them in a pack."""

Expand Down
12 changes: 12 additions & 0 deletions releases/unreleased/include-recovery-mode-for-git-backend.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
---
title: Include recovery mode for Git backend
category: added
author: null
issue: null
notes: >
Include a new option in Git that allows continuing
to fetch commits from the previous execution using the last
commit. The option is `--recovery <commit>`.
The last commit can be obtained from the offset in the summary
object of the last execution or the last item.
166 changes: 166 additions & 0 deletions tests/test_git.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,19 @@ def test_has_resuming(self):

self.assertEqual(Git.has_resuming(), True)

def test_metadata(self):
    """Test that the offset metadata is the commit from the item"""

    new_path = os.path.join(self.tmp_path, 'newgit')

    backend = Git(self.git_path, new_path)
    fetched = list(backend.fetch())

    # Every fetched item must expose its commit hash as the offset
    for item in fetched:
        self.assertEqual(item['offset'], item['data']['commit'])

    shutil.rmtree(new_path)

def test_fetch_submodules(self):
"""Test whether repositories with submodules are correctly fetched"""

Expand Down Expand Up @@ -706,6 +719,111 @@ def test_fetch_from_file(self):
self.assertEqual(commit['category'], 'commit')
self.assertEqual(commit['tag'], 'http://example.com.git')

def test_fetch_recovery_from_repo(self):
    """Test whether recovery from a commit in a first execution works"""

    origin_path = os.path.join(self.tmp_repo_path, 'gittest')
    editable_path = os.path.join(self.tmp_path, 'editgit')
    new_path = os.path.join(self.tmp_path, 'newgit')

    shutil.copytree(origin_path, editable_path)

    git = Git(editable_path, new_path)
    commits = list(git.fetch())

    # Count the number of commits before adding some new
    expected = [('bc57a9209f096a130dcc5ba7089a8663f758a703', 1344965413.0),
                ('87783129c3f00d2c81a3a8e585eb86a47e39891a', 1344965535.0),
                ('7debcf8a2f57f86663809c58b5c07a398be7674c', 1344965607.0),
                ('c0d66f92a95e31c77be08dc9d0f11a16715d1885', 1344965702.0),
                ('c6ba8f7a1058db3e6b4bc6f1090e932b107605fb', 1344966351.0),
                ('589bb080f059834829a2a5955bebfd7c2baa110a', 1344967441.0),
                ('ce8e0b86a1e9877f42fe9453ede418519115f367', 1392185269.0),
                ('51a3b654f252210572297f47597b31527c475fb8', 1392185366.0),
                ('456a68ee1407a77f3e804a30dff245bb6c6b872f', 1392185439.0)]

    self.assertEqual(len(commits), len(expected))

    for commit, (commit_hash, _) in zip(commits, expected):
        self.assertEqual(commit['uuid'], uuid(editable_path, commit_hash))
        self.assertEqual(commit['data']['commit'], commit_hash)

    # Check we can recover from a commit
    from_commit = 'c6ba8f7a1058db3e6b4bc6f1090e932b107605fb'
    commits_recovery = list(git.fetch(recovery_commit=from_commit))

    expected_recovery = expected[4:]

    self.assertEqual(len(commits_recovery), len(expected_recovery))

    for commit, (commit_hash, _) in zip(commits_recovery, expected_recovery):
        self.assertEqual(commit['uuid'], uuid(editable_path, commit_hash))
        self.assertEqual(commit['data']['commit'], commit_hash)

    # Cleanup
    shutil.rmtree(editable_path)
    shutil.rmtree(new_path)

def test_fetch_recovery_from_packs(self):
    """Test whether recovery from a commit in a repo with packs works"""

    origin_path = os.path.join(self.tmp_repo_path, 'gittest')
    editable_path = os.path.join(self.tmp_path, 'editgit')
    new_path = os.path.join(self.tmp_path, 'newgit')
    new_file = os.path.join(editable_path, 'newfile')

    shutil.copytree(origin_path, editable_path)

    git = Git(editable_path, new_path)
    _ = list(git.fetch())

    def run_git(*args):
        # Run a git command inside the editable repository
        subprocess.check_output(list(args), stderr=subprocess.STDOUT,
                                cwd=editable_path, env={'LANG': 'C'})

    # Create some new commits
    run_git('git', 'checkout', '-b', 'mybranch')

    with open(new_file, 'w') as f:
        f.write("Testing sync method")

    run_git('git', 'add', new_file)
    run_git('git', '-c', 'user.name="mock"',
            '-c', 'user.email="[email protected]"',
            'commit', '-m', 'Testing sync')
    run_git('git', 'rm', new_file)
    run_git('git', '-c', 'user.name="mock"',
            '-c', 'user.email="[email protected]"',
            'commit', '-m', 'Removing testing file for sync')

    # Two new commits should have been fetched
    commits = list(git.fetch(latest_items=True))
    self.assertEqual(len(commits), 2)

    # Check if we can recover from the last packfile
    from_commit = commits[0]['data']['commit']
    commits_recovery = list(git.fetch(recovery_commit=from_commit))
    self.assertEqual(len(commits_recovery), 2)
    self.assertEqual(commits_recovery[0]['uuid'], commits[0]['uuid'])
    self.assertEqual(commits_recovery[1]['uuid'], commits[1]['uuid'])

    # Cleanup
    shutil.rmtree(editable_path)
    shutil.rmtree(new_path)

def test_git_parser(self):
"""Test if the static method parses a git log file"""

Expand Down Expand Up @@ -873,6 +991,18 @@ def test_setup_cmd_parser(self):
self.assertEqual(parsed_args.uri, 'http://example.com/')
self.assertFalse(parsed_args.ssl_verify)

args = ['http://example.com/',
'--git-path', '/tmp/gitpath',
'--recovery', 'foocommit']

parsed_args = parser.parse(*args)
self.assertEqual(parsed_args.git_path, '/tmp/gitpath')
self.assertEqual(parsed_args.uri, 'http://example.com/')
self.assertEqual(parsed_args.recovery_commit, 'foocommit')
self.assertFalse(parsed_args.no_update)
self.assertFalse(parsed_args.latest_items)
self.assertTrue(parsed_args.ssl_verify)

def test_mutual_exclusive_update(self):
"""Test whether an exception is thrown when no-update and latest-items flags are set"""

Expand All @@ -885,6 +1015,18 @@ def test_mutual_exclusive_update(self):
with self.assertRaises(SystemExit):
_ = parser.parse(*args)

def test_mutual_exclusive_recovery(self):
    """Test whether an exception is thrown when recovery and latest-items flags are set"""

    parser = GitCommand.setup_cmd_parser()
    cmd_args = ['http://example.com/',
                '--git-path', '/tmp/gitpath',
                '--recovery', 'foocommit',
                '--latest-items']

    # Incompatible options must abort argument parsing
    with self.assertRaises(SystemExit):
        _ = parser.parse(*cmd_args)


class TestGitParser(TestCaseGit):
"""Git parser tests"""
Expand Down Expand Up @@ -1826,6 +1968,30 @@ def test_git_show(self):

shutil.rmtree(new_path)

def test_has_loose_objects(self):
    """Test if the repository has loose objects"""

    new_path = os.path.join(self.tmp_path, 'newgit')

    repo = GitRepository.clone(self.git_path, new_path)

    # Create a loose object in the repository
    hasher = subprocess.Popen(['git', 'hash-object', '-w', '--stdin'],
                              stdin=subprocess.PIPE,
                              cwd=new_path,
                              env={'LANG': 'C'})
    hasher.communicate(input=b"Data test")

    self.assertTrue(repo.has_loose_objects())

    # Group loose objects in a packfile and remove unreachable objects
    for command in (['git', 'gc'], ['git', 'prune']):
        subprocess.run(command, cwd=new_path, check=True)

    self.assertFalse(repo.has_loose_objects())

    shutil.rmtree(new_path)

def test_show_alternates(self):
"""Test show command with alternate objects"""

Expand Down

0 comments on commit 5ab5dcf

Please sign in to comment.