From e7f1935d61aa6e4553afdc1e86972ce4edd66adb Mon Sep 17 00:00:00 2001
From: rongzhang
Date: Thu, 24 Oct 2024 14:46:57 +0000
Subject: [PATCH] feat: github client

---
 .../config/querybook_default_config.yaml      |   3 +
 querybook/server/clients/github_client.py     | 157 ++++++++++++++++++
 querybook/server/datasources/github.py        | 110 ++++++++++++-
 querybook/server/env.py                       |   3 +
 querybook/server/logic/github.py              |  45 ++++--
 .../test_github_client.py                     | 122 ++++++++++++++
 requirements/extra.txt                        |   3 +
 requirements/github_integration/github.txt    |   1 +
 8 files changed, 425 insertions(+), 19 deletions(-)
 create mode 100644 querybook/server/clients/github_client.py
 create mode 100644 querybook/tests/test_lib/test_github_integration/test_github_client.py
 create mode 100644 requirements/github_integration/github.txt

diff --git a/querybook/config/querybook_default_config.yaml b/querybook/config/querybook_default_config.yaml
index 42bd611c2..939cbf088 100644
--- a/querybook/config/querybook_default_config.yaml
+++ b/querybook/config/querybook_default_config.yaml
@@ -106,3 +106,6 @@ VECTOR_STORE_PROVIDER: ~
 VECTOR_STORE_CONFIG:
     embeddings_arg_name: 'embedding_function'
     index_name: 'vector_index_v1'
+
+# --------------- GitHub Integration ---------------
+GITHUB_REPO_URL: ~
diff --git a/querybook/server/clients/github_client.py b/querybook/server/clients/github_client.py
new file mode 100644
index 000000000..7229bf42f
--- /dev/null
+++ b/querybook/server/clients/github_client.py
@@ -0,0 +1,157 @@
+from flask import session as flask_session
+from github import Github, GithubException, Auth
+from typing import List, Dict, Optional
+
+from lib.github_integration.serializers import (
+    deserialize_datadoc_from_markdown,
+    serialize_datadoc_to_markdown,
+)
+from lib.logger import get_logger
+from models.datadoc import DataDoc
+from models.github import GitHubLink
+from env import QuerybookSettings
+
+LOG = get_logger(__name__)
+
+
+class GitHubClient:
+    def __init__(self, github_link: GitHubLink):
+        """
+        Initialize the GitHub client with an access token from the session.
+        Raises an exception if the token is not found.
+        """
+        self.github_link = github_link
+        self.datadoc = github_link.datadoc
+        access_token = self._get_access_token()
+        auth = Auth.Token(access_token)
+        self.client = Github(auth=auth, per_page=5)
+        self.user = self.client.get_user()
+        self.repo = self._get_repository()
+        self.branch = "main"
+        self.file_path = self._build_file_path()
+
+    def _get_access_token(self) -> str:
+        access_token = flask_session.get("github_access_token")
+        if not access_token:
+            LOG.error("GitHub OAuth token not found in session")
+            raise Exception("GitHub OAuth token not found in session")
+        return access_token
+
+    def _get_repository(self):
+        repo_url = QuerybookSettings.GITHUB_REPO_URL
+        if not repo_url:
+            LOG.error("GITHUB_REPO_URL is not configured")
+            raise Exception("GITHUB_REPO_URL is not configured")
+        repo_full_name = self._extract_repo_full_name(repo_url)
+        return self.client.get_repo(repo_full_name)
+
+    @staticmethod
+    def _extract_repo_full_name(repo_url: str) -> str:
+        # Assumes repo_url is in the format 'https://github.com/owner/repo';
+        # a trailing slash or '.git' suffix is tolerated.
+        trimmed = repo_url.strip().rstrip("/")
+        if trimmed.endswith(".git"):
+            trimmed = trimmed[: -len(".git")]
+        parts = trimmed.split("/")
+        if len(parts) < 2 or not parts[-2] or not parts[-1]:
+            raise ValueError("Invalid GITHUB_REPO_URL configuration")
+        return f"{parts[-2]}/{parts[-1]}"
+
+    def _build_file_path(self) -> str:
+        # Strip surrounding slashes so "docs/" or an empty directory never
+        # produces a path containing "//" or starting with "/".
+        directory = (self.github_link.directory or "").strip("/")
+        file_name = f"datadoc_{self.datadoc.id}.md"
+        return f"{directory}/{file_name}" if directory else file_name
+
+    def commit_datadoc(self, commit_message: Optional[str] = None):
+        """
+        Commit a DataDoc to the repository.
+        Args:
+            commit_message (Optional[str]): Commit message. Defaults to a standard message.
+        Raises:
+            Exception: If committing the DataDoc fails.
+        """
+        content = serialize_datadoc_to_markdown(self.datadoc)
+        if not commit_message:
+            commit_message = (
+                f"Update DataDoc {self.datadoc.id}: {self.datadoc.title or 'Untitled'}"
+            )
+
+        try:
+            contents = self.repo.get_contents(self.file_path, ref=self.branch)
+            # Update file
+            self.repo.update_file(
+                path=contents.path,
+                message=commit_message,
+                content=content,
+                sha=contents.sha,
+                branch=self.branch,
+            )
+            LOG.info(f"Updated file {self.file_path} in repository.")
+        except GithubException as e:
+            if e.status == 404:
+                # Create new file
+                self.repo.create_file(
+                    path=self.file_path,
+                    message=commit_message,
+                    content=content,
+                    branch=self.branch,
+                )
+                LOG.info(f"Created file {self.file_path} in repository.")
+            else:
+                LOG.error(f"GitHubException during commit: {e}")
+                raise Exception(f"Failed to commit DataDoc: {e}") from e
+
+    def get_datadoc_versions(self, page: int = 1) -> List[Dict]:
+        """
+        Get the versions of a DataDoc with pagination.
+        Args:
+            page (int): 1-based page number; values below 1 are treated as 1.
+        Returns:
+            List[Dict]: A list of commit dictionaries.
+        """
+        try:
+            # PyGithub pages are 0-indexed, hence the "- 1".
+            commits = self.repo.get_commits(
+                path=self.file_path,
+                sha=self.branch,
+            ).get_page(max(page, 1) - 1)
+            return [commit.raw_data for commit in commits]
+        except GithubException as e:
+            LOG.error(f"GitHubException during get_datadoc_versions: {e}")
+            return []
+
+    def get_datadoc_at_commit(self, commit_sha: str) -> DataDoc:
+        """
+        Get a DataDoc at a specific commit.
+        Args:
+            commit_sha (str): The commit SHA.
+        Returns:
+            DataDoc: The DataDoc object at the specified commit.
+        Raises:
+            Exception: If getting the DataDoc at the commit fails.
+        """
+        try:
+            file_contents = self.repo.get_contents(path=self.file_path, ref=commit_sha)
+            content = file_contents.decoded_content.decode("utf-8")
+            return deserialize_datadoc_from_markdown(content)
+        except GithubException as e:
+            LOG.error(f"GitHubException during get_datadoc_at_commit: {e}")
+            raise Exception(f"Failed to get DataDoc at commit {commit_sha}: {e}") from e
+
+    def get_repo_directories(self) -> List[str]:
+        """
+        Get all directories in the repository.
+        Returns:
+            List[str]: A list of directory names.
+        """
+        try:
+            contents = self.repo.get_contents("")
+            directories = [
+                content.path for content in contents if content.type == "dir"
+            ]
+            return directories
+        except GithubException as e:
+            LOG.error(f"GitHubException during get_directories: {e}")
+            return []
diff --git a/querybook/server/datasources/github.py b/querybook/server/datasources/github.py
index 84f2e2827..6fcc29618 100644
--- a/querybook/server/datasources/github.py
+++ b/querybook/server/datasources/github.py
@@ -1,10 +1,28 @@
-from app.datasource import register
+from app.datasource import register, api_assert
+from app.db import DBSession
 from lib.github_integration.github_integration import get_github_manager
-from typing import Dict
+from clients.github_client import GitHubClient
+from functools import wraps
+from typing import List, Dict, Optional
+from logic import datadoc as datadoc_logic
 from logic import github as logic
+from const.datasources import RESOURCE_NOT_FOUND_STATUS_CODE
+from logic.datadoc_permission import assert_can_read, assert_can_write
+from app.auth.permission import verify_data_doc_permission
 from flask_login import current_user
 
 
+def with_github_client(f):
+    @wraps(f)
+    def decorated_function(*args, **kwargs):
+        datadoc_id = kwargs.get("datadoc_id")
+        github_link = logic.get_repo_link(datadoc_id)
+        github_client = GitHubClient(github_link)
+        return f(github_client, *args, **kwargs)
+
+    return decorated_function
+
+
 @register("/github/auth/", methods=["GET"])
 def connect_github() -> Dict[str, str]:
     github_manager = get_github_manager()
@@ -12,7 +30,7 @@ def connect_github() -> Dict[str, str]:
 
 
 @register("/github/is_authenticated/", methods=["GET"])
-def is_github_authenticated() -> str:
+def is_github_authenticated() -> Dict[str, bool]:
     github_manager = get_github_manager()
     is_authenticated = github_manager.get_github_token() is not None
     return {"is_authenticated": is_authenticated}
@@ -23,6 +41,86 @@ def link_datadoc_to_github(
     datadoc_id: int,
     directory: str,
 ) -> Dict:
-    return logic.create_repo_link(
-        datadoc_id=datadoc_id, user_id=current_user.id, directory=directory
-    )
+    with DBSession() as session:
+        datadoc = datadoc_logic.get_data_doc_by_id(datadoc_id, session=session)
+        api_assert(
+            datadoc is not None,
+            "DataDoc not found",
+            status_code=RESOURCE_NOT_FOUND_STATUS_CODE,
+        )
+        assert_can_write(datadoc_id, session=session)
+        verify_data_doc_permission(datadoc_id, session=session)
+
+        github_link = logic.create_repo_link(
+            datadoc_id=datadoc_id, user_id=current_user.id, directory=directory
+        )
+        return github_link.to_dict()
+
+
+@register("/github/datadocs/<int:datadoc_id>/unlink/", methods=["DELETE"])
+def unlink_datadoc_from_github(datadoc_id: int) -> Dict:
+    with DBSession() as session:
+        datadoc = datadoc_logic.get_data_doc_by_id(datadoc_id, session=session)
+        api_assert(
+            datadoc is not None,
+            "DataDoc not found",
+            status_code=RESOURCE_NOT_FOUND_STATUS_CODE,
+        )
+        assert_can_write(datadoc_id, session=session)
+        verify_data_doc_permission(datadoc_id, session=session)
+
+        logic.delete_repo_link(datadoc_id)
+        return {"message": "Repository unlinked successfully"}
+
+
+@register("/github/datadocs/<int:datadoc_id>/is_linked/", methods=["GET"])
+def is_datadoc_linked(datadoc_id: int) -> Dict[str, bool]:
+    with DBSession() as session:
+        datadoc = datadoc_logic.get_data_doc_by_id(datadoc_id, session=session)
+        api_assert(
+            datadoc is not None,
+            "DataDoc not found",
+            status_code=RESOURCE_NOT_FOUND_STATUS_CODE,
+        )
+        assert_can_read(datadoc_id, session=session)
+        verify_data_doc_permission(datadoc_id, session=session)
+
+        is_linked = logic.is_repo_linked(datadoc_id)
+        return {"is_linked": is_linked}
+
+
+@register("/github/datadocs/<int:datadoc_id>/directories/", methods=["GET"])
+@with_github_client
+def get_github_directories(
+    github_client: GitHubClient, datadoc_id: int
+) -> Dict[str, List[str]]:
+    assert_can_read(datadoc_id)
+    verify_data_doc_permission(datadoc_id)
+    directories = github_client.get_repo_directories()
+    return {"directories": directories}
+
+
+@register("/github/datadocs/<int:datadoc_id>/commit/", methods=["POST"])
+@with_github_client
+def commit_datadoc(
+    github_client: GitHubClient,
+    datadoc_id: int,
+    commit_message: Optional[str] = None,
+) -> Dict:
+    with DBSession() as session:
+        assert_can_write(datadoc_id, session=session)
+        verify_data_doc_permission(datadoc_id, session=session)
+        github_client.commit_datadoc(commit_message=commit_message)
+        return {"message": "DataDoc committed successfully"}
+
+
+@register("/github/datadocs/<int:datadoc_id>/versions/", methods=["GET"])
+@with_github_client
+def get_datadoc_versions(
+    github_client: GitHubClient, datadoc_id: int, limit: int = 5, offset: int = 0
+) -> List[Dict]:
+    assert_can_read(datadoc_id)
+    verify_data_doc_permission(datadoc_id)
+    page = max(offset, 0) // max(limit, 1) + 1
+    versions = github_client.get_datadoc_versions(page=page)
+    return versions
diff --git a/querybook/server/env.py b/querybook/server/env.py
index 2128717a3..2e06f07a4 100644
--- a/querybook/server/env.py
+++ b/querybook/server/env.py
@@ -156,3 +156,6 @@ class QuerybookSettings(object):
     VECTOR_STORE_CONFIG = get_env_config("VECTOR_STORE_CONFIG") or {}
     EMBEDDINGS_PROVIDER = get_env_config("EMBEDDINGS_PROVIDER")
     EMBEDDINGS_CONFIG = get_env_config("EMBEDDINGS_CONFIG") or {}
+
+    # GitHub Integration
+    GITHUB_REPO_URL = get_env_config("GITHUB_REPO_URL")
diff --git a/querybook/server/logic/github.py b/querybook/server/logic/github.py
index 3415dcc2e..def782d73 100644
--- a/querybook/server/logic/github.py
+++ b/querybook/server/logic/github.py
@@ -15,19 +15,23 @@ def create_repo_link(
     assert datadoc is not None, f"DataDoc with id {datadoc_id} not found"
 
     github_link = GitHubLink.get(datadoc_id=datadoc_id, session=session)
-    assert (
-        github_link is None
-    ), f"GitHub link for DataDoc with id {datadoc_id} already exists"
-
-    github_link = GitHubLink.create(
-        {
-            "datadoc_id": datadoc_id,
-            "user_id": user_id,
-            "directory": directory,
-        },
-        commit=commit,
-        session=session,
-    )
+    if github_link is None:
+        github_link = GitHubLink.create(
+            {
+                "datadoc_id": datadoc_id,
+                "user_id": user_id,
+                "directory": directory,
+            },
+            commit=commit,
+            session=session,
+        )
+    else:
+        github_link = GitHubLink.update(
+            id=github_link.id,
+            fields={"directory": directory},
+            commit=commit,
+            session=session,
+        )
     return github_link
 
 
@@ -38,3 +42,18 @@ def get_repo_link(datadoc_id: int, session=None):
         github_link is not None
     ), f"GitHub link for DataDoc with id {datadoc_id} not found"
     return github_link
+
+
+@with_session
+def delete_repo_link(datadoc_id: int, commit=True, session=None):
+    github_link = GitHubLink.get(datadoc_id=datadoc_id, session=session)
+    assert (
+        github_link is not None
+    ), f"GitHub link for DataDoc with id {datadoc_id} not found"
+    GitHubLink.delete(id=github_link.id, commit=commit, session=session)
+
+
+@with_session
+def is_repo_linked(datadoc_id: int, session=None) -> bool:
+    github_link = GitHubLink.get(datadoc_id=datadoc_id, session=session)
+    return github_link is not None
diff --git a/querybook/tests/test_lib/test_github_integration/test_github_client.py b/querybook/tests/test_lib/test_github_integration/test_github_client.py
new file mode 100644
index 000000000..a76146170
--- /dev/null
+++ b/querybook/tests/test_lib/test_github_integration/test_github_client.py
@@ -0,0 +1,122 @@
+import pytest
+from unittest.mock import MagicMock
+from clients.github_client import GitHubClient
+from models.datadoc import DataDoc
+from github import GithubException
+
+
+@pytest.fixture
+def mock_flask_session(monkeypatch):
+    session = {}
+    monkeypatch.setattr("clients.github_client.flask_session", session)
+    return session
+
+
+@pytest.fixture
+def mock_github(monkeypatch):
+    mock_github = MagicMock()
+    monkeypatch.setattr("clients.github_client.Github", mock_github)
+    # GITHUB_REPO_URL defaults to unset, which would make
+    # GitHubClient._get_repository raise before the mocked repo is reached.
+    monkeypatch.setattr(
+        "clients.github_client.QuerybookSettings.GITHUB_REPO_URL",
+        "https://github.com/owner/repo",
+    )
+    return mock_github
+
+
+@pytest.fixture
+def mock_github_link():
+    # GitHubClient only reads `datadoc` and `directory` from the link.
+    return MagicMock(
+        datadoc=DataDoc(id=1, title="Test Doc"),
+        directory="datadocs",
+    )
+
+
+@pytest.fixture
+def mock_repo():
+    return MagicMock()
+
+
+def test_initialization(mock_flask_session, mock_github, mock_github_link, mock_repo):
+    mock_flask_session["github_access_token"] = "fake_token"
+    mock_github_instance = mock_github.return_value
+    mock_github_instance.get_repo.return_value = mock_repo
+
+    client = GitHubClient(mock_github_link)
+    assert client.client is not None
+    assert client.user is not None
+    assert client.repo is not None
+
+
+def test_initialization_no_token(mock_flask_session, mock_github_link):
+    with pytest.raises(Exception) as excinfo:
+        GitHubClient(mock_github_link)
+    assert "GitHub OAuth token not found in session" in str(excinfo.value)
+
+
+def test_commit_datadoc_update(
+    mock_flask_session, mock_github, mock_github_link, mock_repo
+):
+    mock_flask_session["github_access_token"] = "fake_token"
+    mock_github_instance = mock_github.return_value
+    mock_github_instance.get_repo.return_value = mock_repo
+    mock_repo.get_contents.return_value = MagicMock(sha="fake_sha")
+
+    client = GitHubClient(mock_github_link)
+    client.commit_datadoc()
+    mock_repo.update_file.assert_called_once()
+
+
+def test_commit_datadoc_create(
+    mock_flask_session, mock_github, mock_github_link, mock_repo
+):
+    mock_flask_session["github_access_token"] = "fake_token"
+    mock_github_instance = mock_github.return_value
+    mock_github_instance.get_repo.return_value = mock_repo
+    mock_repo.get_contents.side_effect = GithubException(404, "Not Found", None)
+
+    client = GitHubClient(mock_github_link)
+    client.commit_datadoc()
+    mock_repo.create_file.assert_called_once()
+
+
+def test_get_datadoc_versions(
+    mock_flask_session, mock_github, mock_github_link, mock_repo
+):
+    mock_flask_session["github_access_token"] = "fake_token"
+    mock_github_instance = mock_github.return_value
+    mock_github_instance.get_repo.return_value = mock_repo
+    mock_commit = MagicMock()
+    mock_commit.raw_data = {"sha": "123"}
+    # get_commits() returns a paginated list; the client reads one page of it.
+    mock_repo.get_commits.return_value.get_page.return_value = [mock_commit]
+
+    client = GitHubClient(mock_github_link)
+    versions = client.get_datadoc_versions()
+    assert len(versions) == 1
+    assert versions[0]["sha"] == "123"
+
+
+def test_get_datadoc_at_commit(
+    mock_flask_session, mock_github, mock_github_link, mock_repo
+):
+    mock_flask_session["github_access_token"] = "fake_token"
+    mock_github_instance = mock_github.return_value
+    mock_github_instance.get_repo.return_value = mock_repo
+    mock_contents = mock_repo.get_contents.return_value
+    mock_contents.decoded_content = b"""---
+id: 1
+title: Test Doc
+meta: {}
+---
+
+# Test Doc
+
+"""
+
+    client = GitHubClient(mock_github_link)
+    datadoc = client.get_datadoc_at_commit("commit_sha")
+    assert datadoc.id == 1
+    assert datadoc.title == "Test Doc"
diff --git a/requirements/extra.txt b/requirements/extra.txt
index db5edd23e..04b77c8af 100644
--- a/requirements/extra.txt
+++ b/requirements/extra.txt
@@ -28,3 +28,6 @@
 
 # AI Assistant
 -r ai/langchain.txt
+
+# Github
+-r github_integration/github.txt
diff --git a/requirements/github_integration/github.txt b/requirements/github_integration/github.txt
new file mode 100644
index 000000000..9879283d5
--- /dev/null
+++ b/requirements/github_integration/github.txt
@@ -0,0 +1 @@
+pygithub==2.4.0