Skip to content

Commit

Permalink
feat: github client
Browse files Browse the repository at this point in the history
  • Loading branch information
zhangvi7 committed Oct 25, 2024
1 parent 3c37e44 commit e7f1935
Show file tree
Hide file tree
Showing 8 changed files with 415 additions and 19 deletions.
3 changes: 3 additions & 0 deletions querybook/config/querybook_default_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -106,3 +106,6 @@ VECTOR_STORE_PROVIDER: ~
VECTOR_STORE_CONFIG:
embeddings_arg_name: 'embedding_function'
index_name: 'vector_index_v1'

# --------------- GitHub Integration ---------------
GITHUB_REPO_URL: ~
151 changes: 151 additions & 0 deletions querybook/server/clients/github_client.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
from flask import session as flask_session
from github import Github, GithubException, Auth
from typing import List, Dict, Optional

from lib.github_integration.serializers import (
deserialize_datadoc_from_markdown,
serialize_datadoc_to_markdown,
)
from lib.logger import get_logger
from models.datadoc import DataDoc
from models.github import GitHubLink
from env import QuerybookSettings

LOG = get_logger(__name__)


class GitHubClient:
    """
    Client that stores and retrieves a single DataDoc as a Markdown file on GitHub.

    The repository is taken from QuerybookSettings.GITHUB_REPO_URL, the target
    directory from the given GitHubLink, and the OAuth token from the Flask session.
    """

    def __init__(self, github_link: "GitHubLink"):
        """
        Initialize the GitHub client with an access token from the session.
        Args:
            github_link (GitHubLink): Link record providing the DataDoc and its directory.
        Raises:
            Exception: If the token is not in the session or GITHUB_REPO_URL is not configured.
            ValueError: If GITHUB_REPO_URL does not contain an owner and a repository name.
        """
        self.github_link = github_link
        self.datadoc = github_link.datadoc
        access_token = self._get_access_token()
        auth = Auth.Token(access_token)
        # per_page is passed to PyGithub and applies to paginated calls such as get_commits
        self.client = Github(auth=auth, per_page=5)
        self.user = self.client.get_user()
        self.repo = self._get_repository()
        self.branch = "main"
        self.file_path = self._build_file_path()

    def _get_access_token(self) -> str:
        """
        Read the GitHub OAuth token from the Flask session.
        Raises:
            Exception: If no token is stored in the session.
        """
        access_token = flask_session.get("github_access_token")
        if not access_token:
            LOG.error("GitHub OAuth token not found in session")
            raise Exception("GitHub OAuth token not found in session")
        return access_token

    def _get_repository(self):
        """
        Resolve the repository configured in GITHUB_REPO_URL.
        Raises:
            Exception: If GITHUB_REPO_URL is not configured.
            ValueError: If the configured URL cannot be parsed.
        """
        repo_url = QuerybookSettings.GITHUB_REPO_URL
        if not repo_url:
            LOG.error("GITHUB_REPO_URL is not configured")
            raise Exception("GITHUB_REPO_URL is not configured")
        repo_full_name = self._extract_repo_full_name(repo_url)
        return self.client.get_repo(repo_full_name)

    @staticmethod
    def _extract_repo_full_name(repo_url: str) -> str:
        """
        Extract 'owner/repo' from a repository URL.

        Accepts 'https://github.com/owner/repo', optionally with a trailing
        slash or a '.git' suffix, as well as a bare 'owner/repo'.
        Raises:
            ValueError: If the URL does not contain both an owner and a repository.
        """
        cleaned = repo_url.strip().rstrip("/")
        if cleaned.endswith(".git"):
            cleaned = cleaned[: -len(".git")]

        if "://" in cleaned:
            # Drop 'scheme://host' so the host can never be mistaken for the owner
            _, _, remainder = cleaned.partition("://")
            _, _, path = remainder.partition("/")
        else:
            path = cleaned

        segments = [segment for segment in path.split("/") if segment]
        if len(segments) < 2:
            raise ValueError("Invalid GITHUB_REPO_URL configuration")
        return f"{segments[-2]}/{segments[-1]}"

    def _build_file_path(self) -> str:
        """
        Build the repository path of the DataDoc's Markdown file.

        Leading/trailing slashes of the directory are dropped so that an empty
        or slash-terminated directory does not produce '/file' or 'dir//file'.
        """
        directory = (self.github_link.directory or "").strip("/")
        file_name = f"datadoc_{self.datadoc.id}.md"
        return f"{directory}/{file_name}" if directory else file_name

    def _get_existing_file(self):
        """
        Return the file at self.file_path on self.branch, or None if it does not exist.
        Raises:
            GithubException: For any GitHub error other than a 404.
        """
        try:
            return self.repo.get_contents(self.file_path, ref=self.branch)
        except GithubException as e:
            if e.status == 404:
                return None
            raise

    def commit_datadoc(self, commit_message: Optional[str] = None):
        """
        Commit a DataDoc to the repository.

        Updates the file when it already exists on the branch, otherwise creates it.
        Args:
            commit_message (Optional[str]): Commit message. Defaults to a standard message.
        Raises:
            Exception: If committing the DataDoc fails.
        """
        content = serialize_datadoc_to_markdown(self.datadoc)
        if not commit_message:
            commit_message = (
                f"Update DataDoc {self.datadoc.id}: {self.datadoc.title or 'Untitled'}"
            )

        try:
            # Only a 404 from the lookup means "create"; a failing update or
            # create is reported as an error instead of being retried.
            existing_file = self._get_existing_file()
            if existing_file is None:
                self.repo.create_file(
                    path=self.file_path,
                    message=commit_message,
                    content=content,
                    branch=self.branch,
                )
                LOG.info(f"Created file {self.file_path} in repository.")
            else:
                self.repo.update_file(
                    path=existing_file.path,
                    message=commit_message,
                    content=content,
                    sha=existing_file.sha,
                    branch=self.branch,
                )
                LOG.info(f"Updated file {self.file_path} in repository.")
        except GithubException as e:
            LOG.error(f"GitHubException during commit: {e}")
            raise Exception(f"Failed to commit DataDoc: {e}") from e

    def get_datadoc_versions(self, page: int = 1) -> List[Dict]:
        """
        Get the versions of a DataDoc with pagination.
        Args:
            page (int): 1-based page number; values below 1 are treated as 1.
        Returns:
            List[Dict]: A list of commit dictionaries, or an empty list if GitHub
            reports an error.
        """
        try:
            # get_page is 0-based, the public argument is 1-based
            page_index = max(page - 1, 0)
            commits = self.repo.get_commits(
                path=self.file_path,
                sha=self.branch,
            ).get_page(page_index)
            return [commit.raw_data for commit in commits]
        except GithubException as e:
            LOG.error(f"GitHubException during get_datadoc_versions: {e}")
            return []

    def get_datadoc_at_commit(self, commit_sha: str) -> "DataDoc":
        """
        Get a DataDoc at a specific commit.
        Args:
            commit_sha (str): The commit SHA.
        Returns:
            DataDoc: The DataDoc object at the specified commit.
        Raises:
            Exception: If getting the DataDoc at the commit fails.
        """
        try:
            file_contents = self.repo.get_contents(path=self.file_path, ref=commit_sha)
            content = file_contents.decoded_content.decode("utf-8")
            return deserialize_datadoc_from_markdown(content)
        except GithubException as e:
            LOG.error(f"GitHubException during get_datadoc_at_commit: {e}")
            raise Exception(
                f"Failed to get DataDoc at commit {commit_sha}: {e}"
            ) from e

    def get_repo_directories(self) -> List[str]:
        """
        Get the top-level directories of the repository.

        Only the repository root is listed; nested directories are not returned.
        No ref is passed, so the listing is not pinned to self.branch.
        Returns:
            List[str]: A list of directory paths, or an empty list if GitHub
            reports an error.
        """
        try:
            contents = self.repo.get_contents("")
            return [content.path for content in contents if content.type == "dir"]
        except GithubException as e:
            LOG.error(f"GitHubException during get_directories: {e}")
            return []
110 changes: 104 additions & 6 deletions querybook/server/datasources/github.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,36 @@
from app.datasource import register
from app.datasource import register, api_assert
from app.db import DBSession
from lib.github_integration.github_integration import get_github_manager
from typing import Dict
from clients.github_client import GitHubClient
from functools import wraps
from typing import List, Dict, Optional
from logic import datadoc as datadoc_logic
from logic import github as logic
from const.datasources import RESOURCE_NOT_FOUND_STATUS_CODE
from logic.datadoc_permission import assert_can_read, assert_can_write
from app.auth.permission import verify_data_doc_permission
from flask_login import current_user


def with_github_client(f):
    """
    Decorator that injects a GitHubClient for the request's DataDoc.

    The wrapper reads ``datadoc_id`` from the keyword arguments, looks up the
    DataDoc's GitHub link and calls ``f`` with the resulting client as the
    first positional argument, followed by the original arguments.
    """

    @wraps(f)
    def wrapper(*args, **kwargs):
        link = logic.get_repo_link(kwargs.get("datadoc_id"))
        client = GitHubClient(link)
        return f(client, *args, **kwargs)

    return wrapper


@register("/github/auth/", methods=["GET"])
def connect_github() -> Dict[str, str]:
    """Start the GitHub integration flow and return the manager's response."""
    return get_github_manager().initiate_github_integration()


@register("/github/is_authenticated/", methods=["GET"])
def is_github_authenticated() -> Dict[str, bool]:
    """
    Report whether a GitHub token is available for the current user.

    Returns:
        Dict[str, bool]: ``{"is_authenticated": True}`` when the GitHub manager
        returns a token, ``{"is_authenticated": False}`` otherwise.
    """
    github_manager = get_github_manager()
    is_authenticated = github_manager.get_github_token() is not None
    return {"is_authenticated": is_authenticated}
Expand All @@ -23,6 +41,86 @@ def link_datadoc_to_github(
datadoc_id: int,
directory: str,
) -> Dict:
return logic.create_repo_link(
datadoc_id=datadoc_id, user_id=current_user.id, directory=directory
)
with DBSession() as session:
datadoc = datadoc_logic.get_data_doc_by_id(datadoc_id, session=session)
api_assert(
datadoc is not None,
"DataDoc not found",
status_code=RESOURCE_NOT_FOUND_STATUS_CODE,
)
assert_can_write(datadoc_id, session=session)
verify_data_doc_permission(datadoc_id, session=session)

github_link = logic.create_repo_link(
datadoc_id=datadoc_id, user_id=current_user.id, directory=directory
)
return github_link.to_dict()


@register("/github/datadocs/<int:datadoc_id>/unlink/", methods=["DELETE"])
def unlink_datadoc_from_github(datadoc_id: int) -> Dict:
    """
    Remove the GitHub link of a DataDoc.

    Responds with RESOURCE_NOT_FOUND_STATUS_CODE when the DataDoc does not
    exist; the write-permission checks run before the link is deleted.
    """
    with DBSession() as db_session:
        doc = datadoc_logic.get_data_doc_by_id(datadoc_id, session=db_session)
        doc_exists = doc is not None
        api_assert(
            doc_exists,
            "DataDoc not found",
            status_code=RESOURCE_NOT_FOUND_STATUS_CODE,
        )
        assert_can_write(datadoc_id, session=db_session)
        verify_data_doc_permission(datadoc_id, session=db_session)

    logic.delete_repo_link(datadoc_id)
    response = {"message": "Repository unlinked successfully"}
    return response


@register("/github/datadocs/<int:datadoc_id>/is_linked/", methods=["GET"])
def is_datadoc_linked(datadoc_id: int) -> Dict[str, bool]:
    """
    Report whether a DataDoc has a GitHub link.

    Responds with RESOURCE_NOT_FOUND_STATUS_CODE when the DataDoc does not
    exist; the read-permission checks run before the link is looked up.
    """
    with DBSession() as db_session:
        doc = datadoc_logic.get_data_doc_by_id(datadoc_id, session=db_session)
        doc_exists = doc is not None
        api_assert(
            doc_exists,
            "DataDoc not found",
            status_code=RESOURCE_NOT_FOUND_STATUS_CODE,
        )
        assert_can_read(datadoc_id, session=db_session)
        verify_data_doc_permission(datadoc_id, session=db_session)

    return {"is_linked": logic.is_repo_linked(datadoc_id)}


@register("/github/datadocs/<int:datadoc_id>/directories/", methods=["GET"])
@with_github_client
def get_github_directories(
    github_client: GitHubClient, datadoc_id: int
) -> Dict[str, List[str]]:
    """
    List the repository directories available to the DataDoc's GitHub client.

    Read permission on the DataDoc is checked before the directories are fetched.
    """
    assert_can_read(datadoc_id)
    verify_data_doc_permission(datadoc_id)
    return {"directories": github_client.get_repo_directories()}


@register("/github/datadocs/<int:datadoc_id>/commit/", methods=["POST"])
@with_github_client
def commit_datadoc(
    github_client: GitHubClient,
    datadoc_id: int,
    commit_message: Optional[str] = None,
) -> Dict:
    """
    Commit the DataDoc to GitHub through the injected client.

    Write permission on the DataDoc is checked before the commit is made.
    """
    with DBSession() as db_session:
        assert_can_write(datadoc_id, session=db_session)
        verify_data_doc_permission(datadoc_id, session=db_session)
        github_client.commit_datadoc(commit_message=commit_message)

    response = {"message": "DataDoc committed successfully"}
    return response


@register("/github/datadocs/<int:datadoc_id>/versions/", methods=["GET"])
@with_github_client
def get_datadoc_versions(
    github_client: GitHubClient, datadoc_id: int, limit: int = 5, offset: int = 0
) -> List[Dict]:
    """
    Return one page of the DataDoc's commit history.

    Args:
        github_client (GitHubClient): Client injected by ``with_github_client``.
        datadoc_id (int): Id of the DataDoc whose versions are requested.
        limit (int): Page size used to translate ``offset`` into a page number.
        offset (int): Number of versions to skip.
    Returns:
        List[Dict]: The versions returned by the client for the computed page.
    """
    assert_can_read(datadoc_id)
    verify_data_doc_permission(datadoc_id)
    # Reject values that would divide by zero or produce a non-positive page number
    api_assert(limit > 0, "limit must be a positive integer")
    api_assert(offset >= 0, "offset must not be negative")
    # NOTE(review): limit only selects the page here; the number of items per
    # page is presumably decided by the client's own page size - confirm.
    page = offset // limit + 1
    return github_client.get_datadoc_versions(page=page)
3 changes: 3 additions & 0 deletions querybook/server/env.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,3 +156,6 @@ class QuerybookSettings(object):
VECTOR_STORE_CONFIG = get_env_config("VECTOR_STORE_CONFIG") or {}
EMBEDDINGS_PROVIDER = get_env_config("EMBEDDINGS_PROVIDER")
EMBEDDINGS_CONFIG = get_env_config("EMBEDDINGS_CONFIG") or {}

# GitHub Integration
GITHUB_REPO_URL = get_env_config("GITHUB_REPO_URL")
45 changes: 32 additions & 13 deletions querybook/server/logic/github.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,19 +15,23 @@ def create_repo_link(
assert datadoc is not None, f"DataDoc with id {datadoc_id} not found"

github_link = GitHubLink.get(datadoc_id=datadoc_id, session=session)
assert (
github_link is None
), f"GitHub link for DataDoc with id {datadoc_id} already exists"

github_link = GitHubLink.create(
{
"datadoc_id": datadoc_id,
"user_id": user_id,
"directory": directory,
},
commit=commit,
session=session,
)
if github_link is None:
github_link = GitHubLink.create(
{
"datadoc_id": datadoc_id,
"user_id": user_id,
"directory": directory,
},
commit=commit,
session=session,
)
else:
github_link = GitHubLink.update(
id=github_link.id,
fields={"directory": directory},
commit=commit,
session=session,
)
return github_link


Expand All @@ -38,3 +42,18 @@ def get_repo_link(datadoc_id: int, session=None):
github_link is not None
), f"GitHub link for DataDoc with id {datadoc_id} not found"
return github_link


@with_session
def delete_repo_link(datadoc_id: int, session=None):
    """
    Delete the GitHub link of the given DataDoc and commit the change.

    Asserts that a link exists for ``datadoc_id`` before deleting it.
    """
    link = GitHubLink.get(datadoc_id=datadoc_id, session=session)
    link_exists = link is not None
    assert link_exists, f"GitHub link for DataDoc with id {datadoc_id} not found"
    GitHubLink.delete(id=link.id, commit=True, session=session)


@with_session
def is_repo_linked(datadoc_id: int, session=None) -> bool:
    """Return True when a GitHub link exists for the given DataDoc."""
    return GitHubLink.get(datadoc_id=datadoc_id, session=session) is not None
Loading

0 comments on commit e7f1935

Please sign in to comment.