Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Querybook GitHub integration backend #2

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
102 changes: 102 additions & 0 deletions querybook/server/clients/github_client.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
from flask import session as flask_session
from github import Github, GithubException, Auth
from typing import List, Dict

from lib.github_integration.serializers import (
deserialize_datadoc,
serialize_datadoc,
)
from lib.logger import get_logger
from models.datadoc import DataDoc
from models.github import GitHubLink

# Module-level logger, obtained from lib.logger and keyed by this file's path.
LOG = get_logger(__file__)


class GitHubClient:
    """
    Wrapper around PyGithub for the single repository file linked to a DataDoc.

    The repository, branch and file path all come from the GitHubLink passed
    to the constructor; the OAuth token comes from the Flask session.
    """

    def __init__(self, github_link: GitHubLink):
        """
        Initialize the GitHub client with an access token from the session.
        Args:
            github_link (GitHubLink): Link record providing repo_url, branch
                and file_path for the DataDoc.
        Raises:
            Exception: If the token is not found in the session.
        """
        access_token = flask_session.get("github_access_token")
        if not access_token:
            raise Exception("GitHub OAuth token not found in session")
        auth = Auth.Token(access_token)
        self.client = Github(auth=auth)
        self.user = self.client.get_user()
        self.github_link = github_link
        # NOTE(review): PyGithub's get_repo takes "owner/name" or a numeric id;
        # confirm repo_url is stored in that form rather than as a full URL.
        self.repo = self.client.get_repo(github_link.repo_url)

    def commit_datadoc(self, datadoc: DataDoc):
        """
        Commit a DataDoc to the repository.

        Updates the linked file when it already exists on the linked branch,
        otherwise creates it.
        Args:
            datadoc (DataDoc): The DataDoc object to commit.
        Raises:
            Exception: If committing the DataDoc fails.
        """
        file_path = self.github_link.file_path
        branch = self.github_link.branch
        content = serialize_datadoc(datadoc)
        commit_message = f"Update DataDoc {datadoc.id}: {datadoc.title}"

        try:
            # Only the lookup decides between create and update, so that a 404
            # coming from the write itself is not mistaken for "file missing".
            try:
                contents = self.repo.get_contents(file_path, ref=branch)
            except GithubException as e:
                if e.status != 404:
                    raise
                contents = None

            if contents is None:
                # Create new file
                self.repo.create_file(
                    path=file_path,
                    message=commit_message,
                    content=content,
                    branch=branch,
                )
                LOG.info(f"Created file {file_path} in repository.")
            else:
                # Update file
                self.repo.update_file(
                    path=contents.path,
                    message=commit_message,
                    content=content,
                    sha=contents.sha,
                    branch=branch,
                )
                LOG.info(f"Updated file {file_path} in repository.")
        except GithubException as e:
            LOG.error(f"GitHubException: {e}")
            raise Exception(f"Failed to commit DataDoc: {e}") from e

    def get_datadoc_versions(self, datadoc: DataDoc) -> List[Dict]:
        """
        Get the versions of a DataDoc.

        Versions are the commits on the linked branch that touch the linked
        file path. The `datadoc` argument itself is not read.
        Args:
            datadoc (DataDoc): The DataDoc object.
        Returns:
            List[Dict]: A list of commit dictionaries (PyGithub `raw_data`),
                or an empty list if GitHub reports an error.
        """
        file_path = self.github_link.file_path
        try:
            # PyGithub selects the starting branch/commit with `sha`;
            # get_commits has no `ref` keyword.
            commits = self.repo.get_commits(
                path=file_path, sha=self.github_link.branch
            )
            return [commit.raw_data for commit in commits]
        except GithubException as e:
            LOG.error(f"GitHubException: {e}")
            return []

    def get_datadoc_at_commit(self, datadoc_id: int, commit_sha: str) -> DataDoc:
        """
        Get a DataDoc at a specific commit.

        Reads the linked file at `commit_sha` and deserializes it. The
        `datadoc_id` argument is not read; the returned object's id is
        whatever the file content contains.
        Args:
            datadoc_id (int): The DataDoc ID.
            commit_sha (str): The commit SHA.
        Returns:
            DataDoc: The DataDoc object at the specified commit.
        Raises:
            Exception: If getting the DataDoc at the commit fails.
        """
        file_path = self.github_link.file_path
        try:
            file_contents = self.repo.get_contents(path=file_path, ref=commit_sha)
            json_content = file_contents.decoded_content.decode("utf-8")
            return deserialize_datadoc(json_content)
        except GithubException as e:
            LOG.error(f"GitHubException: {e}")
            raise Exception(
                f"Failed to get DataDoc at commit {commit_sha}: {e}"
            ) from e
92 changes: 90 additions & 2 deletions querybook/server/datasources/github.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,26 @@
from app.datasource import register
from app.datasource import register, api_assert
from app.db import DBSession
from lib.github_integration.github_integration import get_github_manager
from typing import Dict
from clients.github_client import GitHubClient
from functools import wraps
from typing import List, Dict
from logic import datadoc as datadoc_logic
from logic import github as logic
from const.datasources import RESOURCE_NOT_FOUND_STATUS_CODE
from logic.datadoc_permission import assert_can_read, assert_can_write
from app.auth.permission import verify_data_doc_permission
from flask_login import current_user


def with_github_client(f):
    """
    Decorator that injects a GitHubClient as the first positional argument.

    The wrapped function must be called with a `datadoc_id` keyword argument;
    it is used to look up the DataDoc's GitHub link, from which the client
    is built.
    """

    @wraps(f)
    def wrapper(*args, **kwargs):
        link = logic.get_repo_link(kwargs.get("datadoc_id"))
        return f(GitHubClient(link), *args, **kwargs)

    return wrapper


@register("/github/auth/", methods=["GET"])
Expand All @@ -14,3 +34,71 @@ def is_github_authenticated() -> str:
github_manager = get_github_manager()
is_authenticated = github_manager.get_github_token() is not None
return {"is_authenticated": is_authenticated}


@register("/github/datadocs/<int:datadoc_id>/link/", methods=["POST"])
def link_datadoc_to_github(
    datadoc_id: int,
    repo_url: str,
    branch: str,
    file_path: str,
) -> Dict:
    """
    Link a DataDoc to a file in a GitHub repository.

    Args:
        datadoc_id (int): The DataDoc to link.
        repo_url (str): Repository identifier stored on the link.
        branch (str): Branch the DataDoc file lives on.
        file_path (str): Path of the DataDoc file inside the repository.
    Returns:
        The link created by `logic.create_repo_link`, owned by the current user.
    """
    # Linking decides where the DataDoc gets committed, so require the same
    # write permission as the commit and restore endpoints.
    assert_can_write(datadoc_id)
    verify_data_doc_permission(datadoc_id)
    return logic.create_repo_link(
        datadoc_id=datadoc_id,
        user_id=current_user.id,
        repo_url=repo_url,
        branch=branch,
        file_path=file_path,
    )


@register("/github/datadocs/<int:datadoc_id>/commit/", methods=["POST"])
@with_github_client
def commit_datadoc(
    github_client: GitHubClient,
    datadoc_id: int,
) -> Dict:
    """
    Commit the current state of a DataDoc to its linked GitHub file.

    Responds with a not-found status when the DataDoc does not exist, and
    requires write permission on it before anything is pushed.
    """
    response = {"message": "DataDoc committed successfully"}
    with DBSession() as session:
        datadoc = datadoc_logic.get_data_doc_by_id(datadoc_id, session=session)
        doc_exists = datadoc is not None
        api_assert(
            doc_exists,
            "DataDoc not found",
            status_code=RESOURCE_NOT_FOUND_STATUS_CODE,
        )
        assert_can_write(datadoc_id, session=session)
        verify_data_doc_permission(datadoc_id, session=session)
        # Commit while the session is still open; the DataDoc was loaded from it.
        github_client.commit_datadoc(datadoc)
        return response


@register("/github/datadocs/<int:datadoc_id>/versions/", methods=["GET"])
@with_github_client
def get_datadoc_versions(github_client: GitHubClient, datadoc_id: int) -> List[Dict]:
    """
    List the GitHub versions of a DataDoc.

    Responds with a not-found status when the DataDoc does not exist, and
    requires read permission on it.
    """
    datadoc = datadoc_logic.get_data_doc_by_id(datadoc_id)
    doc_exists = datadoc is not None
    api_assert(
        doc_exists,
        "DataDoc not found",
        status_code=RESOURCE_NOT_FOUND_STATUS_CODE,
    )
    assert_can_read(datadoc_id)
    verify_data_doc_permission(datadoc_id)
    return github_client.get_datadoc_versions(datadoc)


@register("/github/datadocs/<int:datadoc_id>/restore/", methods=["POST"])
@with_github_client
def restore_datadoc_version(
    github_client: GitHubClient, datadoc_id: int, commit_sha: str
) -> Dict:
    """
    Restore a DataDoc to the content stored at a given commit.

    Args:
        github_client (GitHubClient): Client injected by `with_github_client`.
        datadoc_id (int): The DataDoc to restore.
        commit_sha (str): Commit whose version of the file should be restored.
    Returns:
        Dict: The saved DataDoc, including its cells.
    """
    datadoc = datadoc_logic.get_data_doc_by_id(datadoc_id)
    api_assert(
        datadoc is not None,
        "DataDoc not found",
        status_code=RESOURCE_NOT_FOUND_STATUS_CODE,
    )
    assert_can_write(datadoc_id)
    verify_data_doc_permission(datadoc_id)
    restored_datadoc = github_client.get_datadoc_at_commit(datadoc.id, commit_sha)
    # The restored object's id comes from the repository file, not from the
    # URL. Permissions were checked for `datadoc_id` only, so refuse to write
    # to any other DataDoc.
    api_assert(
        restored_datadoc.id == datadoc_id,
        "Restored content does not belong to this DataDoc",
    )
    saved_datadoc = datadoc_logic.restore_data_doc(restored_datadoc)
    return saved_datadoc.to_dict(with_cells=True)
41 changes: 41 additions & 0 deletions querybook/server/lib/github_integration/serializers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
import json
from models.datadoc import DataDoc, DataCell
from const.data_doc import DataCellType


def serialize_datadoc(datadoc: DataDoc) -> str:
    """
    Render a DataDoc, including its cells, as 4-space-indented JSON text.

    Values the json module cannot encode natively are written via `str()`.
    """
    return json.dumps(datadoc.to_dict(with_cells=True), indent=4, default=str)


def deserialize_datadoc(json_content: str) -> DataDoc:
    """
    Build a DataDoc, with its DataCells, from JSON text.

    Missing keys fall back to None, except `public` (True), `archived`
    (False), `title` ("") and `cells` (empty list). Each cell's `cell_type`
    is looked up by name in DataCellType.
    """
    payload = json.loads(json_content)

    datadoc = DataDoc(
        id=payload.get("id"),
        environment_id=payload.get("environment_id"),
        public=payload.get("public", True),
        archived=payload.get("archived", False),
        owner_uid=payload.get("owner_uid"),
        created_at=payload.get("created_at"),
        updated_at=payload.get("updated_at"),
        title=payload.get("title", ""),
    )

    # meta is assigned after construction rather than passed to the constructor
    datadoc.meta = payload.get("meta")

    # Rebuild the cells in the order they appear in the payload
    datadoc.cells = [
        DataCell(
            id=raw_cell.get("id"),
            cell_type=DataCellType[raw_cell.get("cell_type")],
            context=raw_cell.get("context"),
            meta=raw_cell.get("meta"),
            created_at=raw_cell.get("created_at"),
            updated_at=raw_cell.get("updated_at"),
        )
        for raw_cell in payload.get("cells", [])
    ]
    return datadoc
40 changes: 40 additions & 0 deletions querybook/server/logic/datadoc.py
Original file line number Diff line number Diff line change
Expand Up @@ -248,6 +248,46 @@ def clone_data_doc(id, owner_uid, commit=True, session=None):
return new_data_doc


@with_session
def restore_data_doc(restored_datadoc: DataDoc, commit=True, session=None) -> DataDoc:
    """
    Overwrite a stored DataDoc and its cells with values from a restored copy.

    The DataDoc identified by `restored_datadoc.id` gets the restored public,
    archived, owner_uid, title and meta values. Each restored cell updates the
    context and meta of the stored cell with the same id. Cells are only
    updated here, never added or removed.

    With commit=True the session is committed and the Elasticsearch entries
    for the doc and its queries are refreshed; otherwise the session is only
    flushed. Returns the updated DataDoc after refreshing it from the session.
    """
    # Update the DataDoc fields
    updated_datadoc = update_data_doc(
        id=restored_datadoc.id,
        commit=False,
        session=session,
        public=restored_datadoc.public,
        archived=restored_datadoc.archived,
        owner_uid=restored_datadoc.owner_uid,
        title=restored_datadoc.title,
        meta=restored_datadoc.meta,
    )

    # Update each DataCell
    for cell in restored_datadoc.cells:
        update_data_cell(
            id=cell.id,
            commit=False,
            session=session,
            context=cell.context,
            meta=cell.meta,
        )

    if not commit:
        session.flush()
    else:
        session.commit()
        doc_id = updated_datadoc.id
        update_es_data_doc_by_id(doc_id)
        update_es_queries_by_datadoc_id(doc_id)

    session.refresh(updated_datadoc)
    return updated_datadoc


"""
----------------------------------------------------------------------------------------------------------
DATA CELL
Expand Down
44 changes: 44 additions & 0 deletions querybook/server/logic/github.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
from app.db import with_session
from models.github import GitHubLink
from models.datadoc import DataDoc


@with_session
def create_repo_link(
    datadoc_id: int,
    user_id: int,
    repo_url: str,
    branch: str,
    file_path: str,
    commit=True,
    session=None,
):
    """
    Create the GitHub link for a DataDoc.

    Args:
        datadoc_id (int): DataDoc to link; must exist and not be linked yet.
        user_id (int): User recorded on the link.
        repo_url (str): Repository identifier to store.
        branch (str): Branch to store.
        file_path (str): Path of the DataDoc file inside the repository.
        commit (bool): Passed through to `GitHubLink.create`.
        session: Database session, supplied by `with_session` when omitted.
    Returns:
        GitHubLink: The newly created link.
    Raises:
        AssertionError: If the DataDoc does not exist or is already linked.
    """
    # Raise explicitly instead of using `assert`, so the checks still run
    # when Python is started with -O. The exception type is unchanged.
    datadoc = DataDoc.get(id=datadoc_id, session=session)
    if datadoc is None:
        raise AssertionError(f"DataDoc with id {datadoc_id} not found")

    github_link = GitHubLink.get(datadoc_id=datadoc_id, session=session)
    if github_link is not None:
        raise AssertionError(
            f"GitHub link for DataDoc with id {datadoc_id} already exists"
        )

    github_link = GitHubLink.create(
        {
            "datadoc_id": datadoc_id,
            "user_id": user_id,
            "repo_url": repo_url,
            "branch": branch,
            "file_path": file_path,
        },
        commit=commit,
        session=session,
    )
    return github_link


@with_session
def get_repo_link(datadoc_id: int, session=None):
    """
    Fetch the GitHub link of a DataDoc.

    Args:
        datadoc_id (int): DataDoc whose link is wanted.
        session: Database session, supplied by `with_session` when omitted.
    Returns:
        GitHubLink: The existing link.
    Raises:
        AssertionError: If the DataDoc has no GitHub link.
    """
    github_link = GitHubLink.get(datadoc_id=datadoc_id, session=session)
    # Raise explicitly instead of using `assert`, so callers never receive
    # None when Python is started with -O. The exception type is unchanged.
    if github_link is None:
        raise AssertionError(
            f"GitHub link for DataDoc with id {datadoc_id} not found"
        )
    return github_link
1 change: 1 addition & 0 deletions querybook/server/models/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,4 @@
from .data_element import *
from .comment import *
from .survey import *
from .github import *
41 changes: 41 additions & 0 deletions querybook/server/models/github.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
import sqlalchemy as sql
from sqlalchemy.sql import func
from lib.sqlalchemy import CRUDMixin
from sqlalchemy.orm import backref, relationship
from app import db

# Base class for the models in this module, taken from app.db
# (presumably the SQLAlchemy declarative base; confirm in app/db).
Base = db.Base


class GitHubLink(Base, CRUDMixin):
    """
    Link between a DataDoc and a file on a branch of a GitHub repository.

    `datadoc_id` is unique, so a DataDoc has at most one link.
    """

    __tablename__ = "github_link"

    id = sql.Column(sql.Integer, primary_key=True, autoincrement=True)
    # One link per DataDoc, enforced by the unique constraint.
    datadoc_id = sql.Column(
        sql.Integer, sql.ForeignKey("data_doc.id"), nullable=False, unique=True
    )
    user_id = sql.Column(sql.Integer, sql.ForeignKey("user.id"), nullable=False)
    repo_url = sql.Column(sql.String(255), nullable=False)
    branch = sql.Column(sql.String(255), nullable=False)
    file_path = sql.Column(sql.String(255), nullable=False)
    created_at = sql.Column(sql.DateTime, server_default=func.now(), nullable=False)
    updated_at = sql.Column(
        sql.DateTime, server_default=func.now(), onupdate=func.now(), nullable=False
    )

    # Exposes `github_link` on DataDoc as a single object.
    datadoc = relationship(
        "DataDoc",
        backref=backref("github_link", uselist=False, cascade="all, delete-orphan"),
    )
    user = relationship("User", backref=backref("github_link", uselist=False))

    def to_dict(self):
        """Return the link's column values as a plain dictionary."""
        exported_fields = (
            "id",
            "datadoc_id",
            "user_id",
            "repo_url",
            "branch",
            "file_path",
            "created_at",
            "updated_at",
        )
        return {name: getattr(self, name) for name in exported_fields}
Loading