Commit

Merge pull request #22 from dcppc/groupsio-emails

adding calls to index groupsio emails

charlesreid1 authored Aug 14, 2018
2 parents 1efa5bc + b6c89a2 commit 9d883b0
Showing 8 changed files with 628 additions and 28 deletions.
22 changes: 13 additions & 9 deletions centillion.py
@@ -27,10 +27,16 @@


class UpdateIndexTask(object):
def __init__(self, gh_access_token, diff_index=False):
def __init__(self, app_config, diff_index=False):
self.diff_index = diff_index
thread = threading.Thread(target=self.run, args=())
self.gh_access_token = gh_access_token

self.gh_token = app_config['GITHUB_TOKEN']
self.groupsio_credentials = {
'groupsio_token' : app_config['GROUPSIO_TOKEN'],
'groupsio_username' : app_config['GROUPSIO_USERNAME'],
'groupsio_password' : app_config['GROUPSIO_PASSWORD']
}
thread.daemon = True
thread.start()

@@ -43,9 +49,10 @@ def run(self):
from get_centillion_config import get_centillion_config
config = get_centillion_config('config_centillion.json')

search.update_index_ghfiles(self.gh_access_token,config)
search.update_index_issues(self.gh_access_token,config)
search.update_index_gdocs(config)
search.update_index_emailthreads(self.groupsio_credentials,config)
###search.update_index_ghfiles(self.gh_token,config)
###search.update_index_issues(self.gh_token,config)
###search.update_index_gdocs(config)



@@ -170,12 +177,9 @@ def update_index():
mresp = github.get('/teams/%s/members/%s'%(copper_team_id,username))
if mresp.status_code==204:

#gh_oauth_token = github.token['access_token']
gh_access_token = app.config['GITHUB_TOKEN']

# --------------------
# Business as usual
UpdateIndexTask(gh_access_token,
UpdateIndexTask(app.config,
diff_index=False)
flash("Rebuilding index, check console output")
return render_template("controlpanel.html",
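For reference, a minimal sketch of the config mapping that UpdateIndexTask(app.config, ...) now expects. The four key names are taken from the __init__ changes above; the placeholder values, the import, and the standalone call are illustrative assumptions, not part of this commit.

# Illustrative only: the keys read by UpdateIndexTask.__init__,
# assuming centillion.py is importable as a module.
from centillion import UpdateIndexTask

app_config = {
    'GITHUB_TOKEN'      : 'ghp-xxxxxxxx',        # GitHub API token
    'GROUPSIO_TOKEN'    : 'xxxxxxxx',            # Groups.io API token
    'GROUPSIO_USERNAME' : 'user@example.com',    # Groups.io login
    'GROUPSIO_PASSWORD' : 'xxxxxxxx',            # Groups.io password
}
task = UpdateIndexTask(app_config, diff_index=False)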
151 changes: 137 additions & 14 deletions centillion_search.py
@@ -5,6 +5,7 @@
import base64

from gdrive_util import GDrive
from groupsio_util import GroupsIOArchivesCrawler
from apiclient.http import MediaIoBaseDownload

import mistune
@@ -128,7 +129,6 @@ def open_index(self, index_folder, create_new=False):
schema = Schema(
id = ID(stored=True, unique=True),
kind = ID(stored=True),
#fingerprint = ID(stored=True),

created_time = ID(stored=True),
modified_time = ID(stored=True),
@@ -316,7 +316,7 @@ def add_drive_file(self, writer, item, temp_dir, config, update=False):
# to a search index.


def add_issue(self, writer, issue, gh_access_token, config, update=True):
def add_issue(self, writer, issue, gh_token, config, update=True):
"""
Add a Github issue/comment to a search index.
"""
@@ -368,8 +368,11 @@ def add_issue(self, writer, issue, gh_access_token, config, update=True):



# ------------------------------
# Add a single github file
# to a search index.

def add_ghfile(self, writer, d, gh_access_token, config, update=True):
def add_ghfile(self, writer, d, gh_token, config, update=True):
"""
Use a Github file API record to add a filename
to the search index.
@@ -401,7 +404,7 @@ def add_ghfile(self, writer, d, gh_access_token, config, update=True):
# don't forget the headers for private repos!
# useful: https://bit.ly/2LSAflS

headers = {'Authorization' : 'token %s'%(gh_access_token)}
headers = {'Authorization' : 'token %s'%(gh_token)}

response = requests.get(furl, headers=headers)
if response.status_code==200:
@@ -466,6 +469,41 @@ def add_ghfile(self, writer, d, gh_access_token, config, update=True):




# ------------------------------
# Add a single email thread
# to a search index.

def add_emailthread(self, writer, d, config, update=True):
"""
Use a Groups.io email thread record to add
an email thread to the search index.
"""
indexed_time = clean_timestamp(datetime.now())

# Now create the actual search index record
writer.add_document(
id = d['permalink'],
kind = 'emailthread',
created_time = '',
modified_time = '',
indexed_time = indexed_time,
title = d['subject'],
url = d['permalink'],
mimetype='',
owner_email='',
owner_name=d['original_sender'],
repo_name = '',
repo_url = '',
github_user = '',
issue_title = '',
issue_url = '',
content = d['content']
)
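
# A usage sketch, not part of this commit: pulling an emailthread
# record back out of the Whoosh index that add_emailthread() populates.
# The field names match the writer.add_document() call above, the
# "search_index" directory matches the one used in __main__ below,
# and the query string is arbitrary.

import whoosh.index
from whoosh.qparser import QueryParser

ix = whoosh.index.open_dir("search_index")
with ix.searcher() as s:
    q = QueryParser("content", schema=ix.schema).parse("data commons")
    for hit in s.search(q, limit=10):
        if hit['kind'] == 'emailthread':
            print(hit['title'], hit['url'])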




# ------------------------------
# Define how to update search index
# using different kinds of collections
@@ -590,7 +628,7 @@ def update_index_gdocs(self,
# ------------------------------
# Github Issues/Comments

def update_index_issues(self, gh_access_token, config):
def update_index_issues(self, gh_token, config):
"""
Update the search index using a collection of
Github repo issues and comments.
@@ -615,7 +653,7 @@ def update_index_issues(self, gh_access_token, config):
# Get the set of remote ids:
# ------
# Start with api object
g = Github(gh_access_token)
g = Github(gh_token)

# Now index all issue threads in the user-specified repos

@@ -669,7 +707,7 @@ def update_index_issues(self, gh_access_token, config):
# cop out
writer.delete_by_term('id',update_issue)
item = full_items[update_issue]
self.add_issue(writer, item, gh_access_token, config, update=True)
self.add_issue(writer, item, gh_token, config, update=True)
count += 1


@@ -678,7 +716,7 @@ def update_index_issues(self, gh_access_token, config):
add_issues = remote_issues - indexed_issues
for add_issue in add_issues:
item = full_items[add_issue]
self.add_issue(writer, item, gh_access_token, config, update=False)
self.add_issue(writer, item, gh_token, config, update=False)
count += 1


@@ -688,9 +726,9 @@ def update_index_issues(self, gh_access_token, config):


# ------------------------------
# Github Markdown Files
# Github Files

def update_index_ghfiles(self, gh_access_token, config):
def update_index_ghfiles(self, gh_token, config):
"""
Update the search index using a collection of
files (and, separately, Markdown files) from
@@ -721,7 +759,7 @@ def update_index_ghfiles(self, gh_access_token, config):
# Get the set of remote ids:
# ------
# Start with api object
g = Github(gh_access_token)
g = Github(gh_token)

# Now index all the files.

@@ -795,7 +833,7 @@ def update_index_ghfiles(self, gh_access_token, config):
# cop out: just delete and re-add
writer.delete_by_term('id',update_id)
item = full_items[update_id]
self.add_ghfile(writer, item, gh_access_token, config, update=True)
self.add_ghfile(writer, item, gh_token, config, update=True)
count += 1


@@ -804,7 +842,7 @@ def update_index_ghfiles(self, gh_access_token, config):
add_ids = remote_ids - indexed_ids
for add_id in add_ids:
item = full_items[add_id]
self.add_ghfile(writer, item, gh_access_token, config, update=False)
self.add_ghfile(writer, item, gh_token, config, update=False)
count += 1


@@ -817,10 +855,89 @@ def update_index_ghfiles(self, gh_access_token, config):
# Groups.io Emails


def update_index_emailthreads(self, groupsio_credentials, config):
"""
Update the search index using the email archives
of groups.io groups. This method looks deceptively
simple; all the logic is hidden in the spider
(groupsio_util.py).
RELEASE THE SPIDER!!!
"""
# Algorithm:
# - get set of indexed ids
# - get set of remote ids
# - drop indexed ids not in remote ids
# - index all remote ids

# Get the set of indexed ids:
# ------
indexed_ids = set()
p = QueryParser("kind", schema=self.ix.schema)
q = p.parse("emailthread")
with self.ix.searcher() as s:
results = s.search(q,limit=None)
for result in results:
indexed_ids.add(result['id'])

# Get the set of remote ids:
# ------
spider = GroupsIOArchivesCrawler(groupsio_credentials, 'dcppc')

# ask spider to crawl the archives
spider.crawl_group_archives()

# now spider.get_archives() returns a dictionary
# keyed by thread permalink (URL); each value
# is a dictionary representing one thread:
#
# thread = {
#     'permalink' : permalink,
#     'subject' : subject,
#     'original_sender' : original_sender,
#     'content' : full_content
# }
#
# It is hard to reliably extract more
# information than that from an email thread.
archives = spider.get_archives()

# Collect the set of remote thread ids
remote_ids = set()
for k in archives.keys():
remote_ids.add(k)

writer = self.ix.writer()
count = 0

# Drop any id in indexed_ids
# not in remote_ids
drop_ids = indexed_ids - remote_ids
for drop_id in drop_ids:
writer.delete_by_term('id',drop_id)

# Update any id in indexed_ids
# and in remote_ids
update_ids = indexed_ids & remote_ids
for update_id in update_ids:
# cop out: just delete and re-add
writer.delete_by_term('id',update_id)
item = archives[update_id]
self.add_emailthread(writer, item, config, update=True)
count += 1

# Add any id in remote_ids
# but not in indexed_ids
add_ids = remote_ids - indexed_ids
for add_id in add_ids:
item = archives[add_id]
self.add_emailthread(writer, item, config, update=False)
count += 1

writer.commit()
print("Done, updated %d Groups.io email threads in the index" % count)
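
# Not part of this commit: a minimal skeleton of the interface that
# update_index_emailthreads() assumes groupsio_util.py provides. Only
# the two methods called above are shown; the constructor arguments
# are inferred from the call site, and the crawling internals are a
# guess.

class GroupsIOArchivesCrawler(object):
    def __init__(self, credentials, group_name):
        self.credentials = credentials   # dict: token, username, password
        self.group_name = group_name     # e.g. 'dcppc'
        self.archives = {}               # permalink -> thread dictionary

    def crawl_group_archives(self):
        # Walk the group's web archive and fill self.archives
        # with one record per email thread.
        raise NotImplementedError

    def get_archives(self):
        # Dictionary keyed by thread permalink (URL); each value
        # holds 'permalink', 'subject', 'original_sender', 'content'.
        return self.archives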


# ---------------------------------
@@ -935,6 +1052,7 @@ def get_document_total_count(self):
"issue" : None,
"ghfile" : None,
"markdown" : None,
"emailthread" : None,
"total" : None
}
for key in counts.keys():
@@ -962,6 +1080,8 @@ def get_list(self,doctype):
item_keys=''
if doctype=='gdoc':
item_keys = ['title','owner_name','url','mimetype']
elif doctype=='emailthread':
item_keys = ['title','owner_name','url']
elif doctype=='issue':
item_keys = ['title','repo_name','repo_url','url']
elif doctype=='ghfile':
@@ -987,6 +1107,9 @@ def get_list(self,doctype):


if __name__ == "__main__":

raise Exception("Error: main method not implemented (fix groupsio credentials first)")

search = Search("search_index")

from get_centillion_config import get_centillion_config