diff --git a/centillion.py b/centillion.py
index 48cf8de..f4ff7c4 100644
--- a/centillion.py
+++ b/centillion.py
@@ -27,10 +27,16 @@ class UpdateIndexTask(object):
-    def __init__(self, gh_access_token, diff_index=False):
+    def __init__(self, app_config, diff_index=False):
         self.diff_index = diff_index
         thread = threading.Thread(target=self.run, args=())
-        self.gh_access_token = gh_access_token
+
+        self.gh_token = app_config['GITHUB_TOKEN']
+        self.groupsio_credentials = {
+                'groupsio_token' : app_config['GROUPSIO_TOKEN'],
+                'groupsio_username' : app_config['GROUPSIO_USERNAME'],
+                'groupsio_password' : app_config['GROUPSIO_PASSWORD']
+        }
+
         thread.daemon = True
         thread.start()
 
@@ -43,9 +49,10 @@ def run(self):
         from get_centillion_config import get_centillion_config
         config = get_centillion_config('config_centillion.json')
 
-        search.update_index_ghfiles(self.gh_access_token,config)
-        search.update_index_issues(self.gh_access_token,config)
-        search.update_index_gdocs(config)
+        search.update_index_emailthreads(self.groupsio_credentials,config)
+        ###search.update_index_ghfiles(self.gh_token,config)
+        ###search.update_index_issues(self.gh_token,config)
+        ###search.update_index_gdocs(config)
 
@@ -170,12 +177,9 @@ def update_index():
         mresp = github.get('/teams/%s/members/%s'%(copper_team_id,username))
         if mresp.status_code==204:
 
-            #gh_oauth_token = github.token['access_token']
-            gh_access_token = app.config['GITHUB_TOKEN']
-
             # --------------------
             # Business as usual
-            UpdateIndexTask(gh_access_token,
+            UpdateIndexTask(app.config,
                             diff_index=False)
             flash("Rebuilding index, check console output")
             return render_template("controlpanel.html",
diff --git a/centillion_search.py b/centillion_search.py
index 0003059..cdcefd2 100644
--- a/centillion_search.py
+++ b/centillion_search.py
@@ -5,6 +5,7 @@
 import base64
 
 from gdrive_util import GDrive
+from groupsio_util import GroupsIOArchivesCrawler
 from apiclient.http import MediaIoBaseDownload
 
 import mistune
@@ -128,7 +129,6 @@ def open_index(self, index_folder, create_new=False):
         schema = Schema(
                 id = ID(stored=True, unique=True),
                 kind = ID(stored=True),
-                #fingerprint = ID(stored=True),
 
                 created_time = ID(stored=True),
                 modified_time = ID(stored=True),
@@ -316,7 +316,7 @@ def add_drive_file(self, writer, item, temp_dir, config, update=False):
     # to a search index.
 
-    def add_issue(self, writer, issue, gh_access_token, config, update=True):
+    def add_issue(self, writer, issue, gh_token, config, update=True):
         """
         Add a Github issue/comment to a search index.
         """
@@ -368,8 +368,11 @@
 
 
+    # ------------------------------
+    # Add a single github file
+    # to a search index.
 
-    def add_ghfile(self, writer, d, gh_access_token, config, update=True):
+    def add_ghfile(self, writer, d, gh_token, config, update=True):
         """
         Use a Github file API record to add a filename
         to the search index.
@@ -401,7 +404,7 @@ def add_ghfile(self, writer, d, gh_access_token, config, update=True):
 
         # don't forget the headers for private repos!
         # useful: https://bit.ly/2LSAflS
-        headers = {'Authorization' : 'token %s'%(gh_access_token)}
+        headers = {'Authorization' : 'token %s'%(gh_token)}
 
         response = requests.get(furl, headers=headers)
         if response.status_code==200:
@@ -466,6 +469,41 @@
 
 
+
+    # ------------------------------
+    # Add a single email thread
+    # to a search index.
+
+    def add_emailthread(self, writer, d, config, update=True):
+        """
+        Add a Groups.io email thread record
+        to the search index.
+        """
+        indexed_time = clean_timestamp(datetime.now())
+
+        # Now create the actual search index record
+        writer.add_document(
+                id = d['permalink'],
+                kind = 'emailthread',
+                created_time = '',
+                modified_time = '',
+                indexed_time = indexed_time,
+                title = d['subject'],
+                url = d['permalink'],
+                mimetype='',
+                owner_email='',
+                owner_name=d['original_sender'],
+                repo_name = '',
+                repo_url = '',
+                github_user = '',
+                issue_title = '',
+                issue_url = '',
+                content = d['content']
+        )
+
+
+
     # ------------------------------
     # Define how to update search index
     # using different kinds of collections
@@ -590,7 +628,7 @@ def update_index_gdocs(self,
 
     # ------------------------------
     # Github Issues/Comments
 
-    def update_index_issues(self, gh_access_token, config):
+    def update_index_issues(self, gh_token, config):
         """
         Update the search index using a collection of
         Github repo issues and comments.
@@ -615,7 +653,7 @@ def update_index_issues(self, gh_access_token, config):
         # Get the set of remote ids:
         # ------
         # Start with api object
-        g = Github(gh_access_token)
+        g = Github(gh_token)
 
         # Now index all issue threads in the user-specified repos
@@ -669,7 +707,7 @@ def update_index_issues(self, gh_access_token, config):
                 # cop out
                 writer.delete_by_term('id',update_issue)
                 item = full_items[update_issue]
-                self.add_issue(writer, item, gh_access_token, config, update=True)
+                self.add_issue(writer, item, gh_token, config, update=True)
                 count += 1
 
@@ -678,7 +716,7 @@ def update_index_issues(self, gh_access_token, config):
             add_issues = remote_issues - indexed_issues
             for add_issue in add_issues:
                 item = full_items[add_issue]
-                self.add_issue(writer, item, gh_access_token, config, update=False)
+                self.add_issue(writer, item, gh_token, config, update=False)
                 count += 1
 
@@ -688,9 +726,9 @@ def update_index_issues(self, gh_access_token, config):
 
 
     # ------------------------------
-    # Github Markdown Files
+    # Github Files
 
-    def update_index_ghfiles(self, gh_access_token, config):
+    def update_index_ghfiles(self, gh_token, config):
         """
         Update the search index using a collection of
         files (and, separately, Markdown files) from
@@ -721,7 +759,7 @@ def update_index_ghfiles(self, gh_access_token, config):
         # Get the set of remote ids:
         # ------
         # Start with api object
-        g = Github(gh_access_token)
+        g = Github(gh_token)
 
         # Now index all the files.
@@ -795,7 +833,7 @@ def update_index_ghfiles(self, gh_access_token, config):
                 # cop out: just delete and re-add
                 writer.delete_by_term('id',update_id)
                 item = full_items[update_id]
-                self.add_ghfile(writer, item, gh_access_token, config, update=True)
+                self.add_ghfile(writer, item, gh_token, config, update=True)
                 count += 1
 
@@ -804,7 +842,7 @@ def update_index_ghfiles(self, gh_access_token, config):
             add_ids = remote_ids - indexed_ids
             for add_id in add_ids:
                 item = full_items[add_id]
-                self.add_ghfile(writer, item, gh_access_token, config, update=False)
+                self.add_ghfile(writer, item, gh_token, config, update=False)
                 count += 1
 
@@ -817,10 +855,89 @@ def update_index_ghfiles(self, gh_access_token, config):
 
     # Groups.io Emails
 
-    #def update_index_markdown(self, gh_access_token, config):
+    def update_index_emailthreads(self, groupsio_credentials, config):
+        """
+        Update the search index using the email archives
+        of groups.io groups. This method looks deceptively
+        simple: all the logic is hidden in the spider
+        (groupsio_util.py).
+
+        RELEASE THE SPIDER!!!
+        """
+        # Algorithm:
+        # - get set of indexed ids
+        # - get set of remote ids
+        # - drop indexed ids not in remote ids
+        # - index all remote ids
+
+        # Get the set of indexed ids:
+        # ------
+        indexed_ids = set()
+        p = QueryParser("kind", schema=self.ix.schema)
+        q = p.parse("emailthread")
+        with self.ix.searcher() as s:
+            results = s.search(q,limit=None)
+            for result in results:
+                indexed_ids.add(result['id'])
+
+        # Get the set of remote ids:
+        # ------
+        spider = GroupsIOArchivesCrawler(groupsio_credentials,'dcppc')
+
+        # ask spider to crawl the archives
+        spider.crawl_group_archives()
+
+        # now spider.archives is a dictionary of threads,
+        # each thread represented by a dictionary:
+        # thread = {
+        #         'permalink' : permalink,
+        #         'subject' : subject,
+        #         'original_sender' : original_sender,
+        #         'content' : full_content
+        # }
+        #
+        # It is hard to reliably extract more information
+        # than that from the email thread.
+
+        # archives is a dictionary:
+        # keys are IDs (urls),
+        # values are dictionaries
+        archives = spider.get_archives()
+
+        # Start by collecting all the things
+        remote_ids = set()
+        for k in archives.keys():
+            remote_ids.add(k)
+
+        writer = self.ix.writer()
+        count = 0
+
+        # Drop any id in indexed_ids
+        # not in remote_ids
+        drop_ids = indexed_ids - remote_ids
+        for drop_id in drop_ids:
+            writer.delete_by_term('id',drop_id)
+
+        # Update any id in indexed_ids
+        # and in remote_ids
+        update_ids = indexed_ids & remote_ids
+        for update_id in update_ids:
+            # cop out: just delete and re-add
+            writer.delete_by_term('id',update_id)
+            item = archives[update_id]
+            self.add_emailthread(writer, item, config, update=True)
+            count += 1
+
+        # Add any id in remote_ids
+        # and not in indexed_ids
+        add_ids = remote_ids - indexed_ids
+        for add_id in add_ids:
+            item = archives[add_id]
+            self.add_emailthread(writer, item, config, update=False)
+            count += 1
+
+        writer.commit()
+        print("Done, updated %d Groups.io email threads in the index" % count)
 
 
     # ---------------------------------
@@ -935,6 +1052,7 @@ def get_document_total_count(self):
             "issue" : None,
             "ghfile" : None,
             "markdown" : None,
+            "emailthread" : None,
             "total" : None
         }
         for key in counts.keys():
@@ -962,6 +1080,8 @@ def get_list(self,doctype):
         item_keys=''
         if doctype=='gdoc':
             item_keys = ['title','owner_name','url','mimetype']
+        elif doctype=='emailthread':
+            item_keys = ['title','owner_name','url']
         elif doctype=='issue':
             item_keys = ['title','repo_name','repo_url','url']
         elif doctype=='ghfile':
@@ -987,6 +1107,9 @@
 
 if __name__ == "__main__":
+
+    raise Exception("Error: main method not implemented (fix groupsio credentials first)")
+
     search = Search("search_index")
 
     from get_centillion_config import get_centillion_config
diff --git a/groupsio_util.py b/groupsio_util.py
new file mode 100644
index 0000000..38a3bc7
--- /dev/null
+++ b/groupsio_util.py
@@ -0,0 +1,376 @@
+import requests, os, re
+from bs4 import BeautifulSoup
+
+class GroupsIOArchivesCrawler(object):
+    """
+    This is a Groups.io spider
+    designed to crawl the email
+    archives of a group.
+
+    credentials (dictionary):
+        groupsio_token : api access token
+        groupsio_username : username
+        groupsio_password : password
+    """
+    def __init__(self,
+                 credentials,
+                 group_name):
+        # template url for archives page (list of topics)
+        self.url = "https://{group}.groups.io/g/{subgroup}/topics"
+        self.login_url = "https://groups.io/login"
+
+        self.credentials = credentials
+        self.group_name = group_name
+        self.crawled_archives = False
+        self.archives = None
+
+
+    def get_archives(self):
+        """
+        Return a dictionary containing information about
+        each email topic in the groups.io email archive,
+        keyed by topic permalink.
+
+        Call crawl_group_archives() first!
+        """
+        return self.archives
+
+
+    def get_subgroups_list(self):
+        """
+        Use the API to get a list of subgroups.
+        """
+        subgroups_url = 'https://api.groups.io/v1/getsubgroups'
+
+        key = self.credentials['groupsio_token']
+
+        data = [('group_name', self.group_name),
+                ('limit',100)
+        ]
+        response = requests.post(subgroups_url,
+                                 data=data,
+                                 auth=(key,''))
+        response = response.json()
+        data = response['data']
+
+        subgroups = {}
+        for group in data:
+            k = group['id']
+            v = re.sub(r'dcppc\+','',group['name'])
+            subgroups[k] = v
+
+        return subgroups
+
+
+    def crawl_group_archives(self):
+        """
+        The spider crawls the email archives of the entire group
+        by crawling the email archives of each subgroup.
+        """
+        self.archives = {}
+
+        subgroups = self.get_subgroups_list()
+
+        # ------------------------------
+        # Start by logging in.
+
+        # Create session object to persist session data
+        session = requests.Session()
+
+        # Log in to the website
+        data = dict(email = self.credentials['groupsio_username'],
+                    password = self.credentials['groupsio_password'],
+                    timezone = 'America/Los_Angeles')
+
+        r = session.post(self.login_url,
+                         data = data)
+
+        csrf = self.get_csrf(r)
+
+        # ------------------------------
+        # For each subgroup, crawl the archives
+        # and assemble a dictionary containing
+        # all the email threads.
+        for subgroup_id in subgroups.keys():
+            self.crawl_subgroup_archives(session,
+                                         csrf,
+                                         subgroup_id,
+                                         subgroups[subgroup_id])
+
+        # Done. Archives are now tucked away
+        # in the variable self.archives
+        #
+        # self.archives is a dictionary of dictionaries,
+        # with each key a URL and each value a dictionary
+        # containing info about a thread.
+        # ------------------------------
+
+
+    def crawl_subgroup_archives(self, session, csrf, subgroup_id, subgroup_name):
+        """
+        This kicks off the process to crawl the entire
+        archives of a given subgroup on groups.io.
+
+        For a given subgroup the url is self.url,
+
+            https://{group}.groups.io/g/{subgroup}/topics
+
+        This is the first page of a paginated list of topics.
+        The procedure is:
+        - start from the first page (or its contents)
+        - iterate through all topics via the HTML page elements
+        - assemble a bundle of information about each topic:
+            - topic title, by, URL, date, content, permalink
+        - content filtering:
+            - ^From, Reply-To, Date, To, Subject
+            - lines containing phone numbers:
+                - 9 digits
+                - XXX-XXX-XXXX, (XXX) XXX-XXXX
+                - XXXXXXXXXX, XXX XXX XXXX
+                - ^Work: or (Work) or Work$
+                - Home, Cell, Mobile
+                - +1 XXX
+                - \w@\w
+        - while the next button is not greyed out,
+          click the next button
+
+        Everything is stored in self.archives,
+        a dictionary of thread dictionaries.
+        """
+        prefix = "https://{group}.groups.io".format(group=self.group_name)
+
+        url = self.url.format(group=self.group_name,
+                              subgroup=subgroup_name)
+
+        # ------------------------------
+
+        # Now get the first page
+        r = session.get(url)
+
+        # ------------------------------
+        # Fencepost algorithm:
+
+        # First page:
+
+        # Extract a list of (title, link) items
+        items = self.extract_archive_page_items_(r)
+
+        # Get the next link
+        next_url = self.get_next_url_(r)
+
+        # Now add each item to the archive of threads,
+        # then find the next button.
+        self.add_items_to_archives_(session,subgroup_name,items)
+
+        if next_url is None:
+            return
+        else:
+            full_next_url = prefix + next_url
+
+        # Now click the next button
+        # (re-use the logged-in session, since the archives
+        #  are not visible to anonymous requests)
+        next_request = session.get(full_next_url)
+
+        while next_request.status_code==200:
+            items = self.extract_archive_page_items_(next_request)
+            next_url = self.get_next_url_(next_request)
+            self.add_items_to_archives_(session,subgroup_name,items)
+            if next_url is None:
+                return
+            else:
+                full_next_url = prefix + next_url
+                next_request = session.get(full_next_url)
+
+
+    def add_items_to_archives_(self,session,subgroup_name,items):
+        """
+        Given a set of items from a list of threads,
+        items being (title, link) tuples,
+        get each thread page and store its info
+        in the self.archives variable
+        (a dictionary keyed by permalink).
+        """
+        for (title, link) in items:
+            # Get the thread page:
+            prefix = "https://{group}.groups.io".format(group=self.group_name)
+            full_link = prefix + link
+            r = session.get(full_link)
+            soup = BeautifulSoup(r.text,'html.parser')
+
+            # soup contains the entire thread
+
+            # What are we extracting:
+            # 1. thread number
+            # 2. permalink
+            # 3. content/text (filtered)
+
+            # - - - - - - - - - - - - - -
+            # 1. topic/thread number:
+            #
+            # where link is:
+            # https://{group}.groups.io/g/{subgroup}/topic/{topic_id}
+            # example topic id: 24209140
+            #
+            # ugly links are in the form
+            # https://dcppc.groups.io/g/{subgroup}/topic/some_text_here/{thread_id}?p=,,,,,1,2,3,,,4,,5
+            # split at ?, take the 0th portion,
+            # then split at /, take the last (-1th) portion
+            topic_id = link.split('?')[0].split('/')[-1]
+
+            # - - - - - - - - - - - - - - -
+            # 2. permalink:
+            # - current link is the ugly link
+            # - permalink is the nice one
+            # - topic id is available from the ugly link
+            # https://{group}.groups.io/g/{subgroup}/topic/{topic_id}
+
+            permalink_template = "https://{group}.groups.io/g/{subgroup}/topic/{topic_id}"
+            permalink = permalink_template.format(
+                    group = self.group_name,
+                    subgroup = subgroup_name,
+                    topic_id = topic_id
+            )
+
+            # - - - - - - - - - - - - - - -
+            # 3. content:
+
+            # Need to rearrange how we're assembling threads here.
+            # This is one thread, no?
+            content = []
+
+            subject = soup.find('title').text
+
+            # Extract information for the schema:
+            # - permalink for thread (done)
+            # - subject/title (done)
+            # - original sender email/name (done)
+            # - content (done)
+
+            # Groups.io pages have almost no CSS classes, which makes everything
+            # a giant pain in the neck to interact with. Thanks Groups.io!
+            original_sender = ''
+            for i, tr in enumerate(soup.find_all('tr',{'class':'test'})):
+                # Only every other tr row contains an email.
+                if (i+1)%2==0:
+                    # nope, no email here
+                    pass
+                else:
+                    # found an email!
+                    # this is a maze, thanks groups.io
+                    td = tr.find('td')
+                    divrow = td.find('div',{'class':'row'}).find('div',{'class':'pull-left'})
+                    if (i+1)==1:
+                        original_sender = divrow.text.strip()
+                    for div in td.find_all('div'):
+                        if div.has_attr('id'):
+
+                            # purge any signatures
+                            for x in div.find_all('div',{'id':'Signature'}):
+                                x.extract()
+
+                            # purge any headers
+                            for x in div.find_all('div'):
+                                nonos = ['From:','Sent:','To:','Cc:','CC:','Subject:']
+                                for nono in nonos:
+                                    if nono in x.text:
+                                        x.extract()
+
+                            message_text = div.get_text()
+
+                            # More filtering:
+
+                            # phone numbers
+                            message_text = re.sub(r'[0-9]{3}-[0-9]{3}-[0-9]{4}','XXX-XXX-XXXX',message_text)
+                            message_text = re.sub(r'[0-9]{10}','XXXXXXXXXX',message_text)
+
+                            content.append(message_text)
+
+            full_content = "\n".join(content)
+
+            thread = {
+                    'permalink' : permalink,
+                    'subject' : subject,
+                    'original_sender' : original_sender,
+                    'content' : full_content
+            }
+
+            print(" + Archiving thread: %s"%(thread['subject']))
+            self.archives[permalink] = thread
+
+
+    def extract_archive_page_items_(self, response):
+        """
+        (Private method)
+
+        Given a response from a GET request,
+        use BeautifulSoup to extract all items
+        (thread titles and ugly thread links)
+        and pass them back in a list.
+        """
+        soup = BeautifulSoup(response.content,"html.parser")
+        rows = soup.find_all('tr',{'class':'test'})
+        if 'rate limited' in soup.text:
+            raise Exception("Error: rate limit in place for Groups.io")
+
+        results = []
+        for row in rows:
+            # We don't care about anything except the title and the ugly link
+            subject = row.find('span',{'class':'subject'})
+            title = subject.get_text()
+            link = row.find('a')['href']
+            #print(title)
+            results.append((title,link))
+
+        return results
+
+
+    def get_next_url_(self, response):
+        """
+        (Private method)
+
+        Given a response (which is a list of threads),
+        find the next button and return its URL.
+
+        If there is no next URL, or if the next button
+        is disabled, return None.
+        """
+        soup = BeautifulSoup(response.text,'html.parser')
+        chevron = soup.find('i',{'class':'fa-chevron-right'})
+
+        try:
+            if '#' in chevron.parent['href']:
+                # empty link, abort
+                return None
+        except AttributeError:
+            # no next button on the page at all, abort
+            return None
+
+        if chevron.parent.parent.has_attr('class') and 'disabled' in chevron.parent.parent['class']:
+            # no next link, abort
+            return None
+
+        return chevron.parent['href']
+
+
+    def get_csrf(self,resp):
+        """
+        Find the CSRF token embedded in the subgroup page
+        """
+        soup = BeautifulSoup(resp.text,'html.parser')
+        csrf = ''
+        for i in soup.find_all('input'):
+            # Note that i.name is different from i['name']:
+            # the first is the tag name,
+            # the second is the attribute name="xyz"
+            if i['name']=='csrf':
+                csrf = i['value']
+
+        if csrf=='':
+            err = "ERROR: Could not find csrf token on page."
+            raise Exception(err)
+
+        return csrf
+
diff --git a/requirements.txt b/requirements.txt
index f8e6421..d3e48b8 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -10,3 +10,4 @@ pypandoc>=1.4
 requests>=2.19
 pandoc>=1.0
 flask-dance>=1.0.0
+beautifulsoup4>=4.6
diff --git a/static/centillion_master_list.js b/static/centillion_master_list.js
index d820fb0..af50668 100644
--- a/static/centillion_master_list.js
+++ b/static/centillion_master_list.js
@@ -26,6 +26,10 @@ $(document).ready(function() {
             load_gdoc_table();
             var divList = $('div#collapseDrive').addClass('in');
 
+        } else if (d==='emailthread') {
+            load_emailthreads_table();
+            var divList = $('div#collapseThreads').addClass('in');
+
         } else if (d==='issue') {
             load_issue_table();
             var divList = $('div#collapseIssues').addClass('in');
@@ -58,6 +62,7 @@
 // Github issues
 // Github files
 // Github markdown
+// Groups.io email threads
 // ------------------------
 
 // Google Drive
@@ -227,3 +232,42 @@ function load_markdown_table(){
     }
 }
+
+// ------------------------
+// Groups.io Email Threads
+
+function load_emailthreads_table(){
+    var divList = $('div#collapseThreads').attr('class');
+    if (divList.indexOf('in') !== -1) {
+        console.log('Closing Groups.io email threads master list');
+    } else {
+        console.log('Opening Groups.io email threads master list');
+
+        $.getJSON("/list/emailthread", function(result){
+            var r = new Array(), j = -1, size=result.length;
+            r[++j] = '<thead>'
+            r[++j] = '<tr>';
+            r[++j] = '<th>Topic</th>';
+            r[++j] = '<th>Started By</th>';
+            r[++j] = '</tr>';
+            r[++j] = '</thead>'
+            r[++j] = '<tbody>'
+            for (var i=0; i<size; i++) {
+                r[++j] = '<tr><td><a href="' + result[i]['url'] + '" target="_blank">'
+                r[++j] = result[i]['title'];
+                r[++j] = '</a>'
+                r[++j] = '</td><td>';
+                r[++j] = result[i]['owner_name'];
+                r[++j] = '</td></tr>';
+            }
+            r[++j] = '</tbody>'
+            $('#emailthreads-master-list').html(r.join(''));
+            $('#emailthreads-master-list').DataTable({
+                responsive: true,
+                lengthMenu: [50,100,250,500]
+            });
+        });
+        console.log('Finished loading Groups.io email threads list');
+    }
+}
diff --git a/static/style.css b/static/style.css
index 7b74c87..d16b001 100755
--- a/static/style.css
+++ b/static/style.css
@@ -1,3 +1,8 @@
+#the-big-one {
+    margin-top: 10px;
+    margin-bottom: 10px;
+}
+
 span.badge {
     vertical-align: text-bottom;
 }
diff --git a/templates/masterlist.html b/templates/masterlist.html
index e20b0a7..9c18286 100755
--- a/templates/masterlist.html
+++ b/templates/masterlist.html
@@ -187,6 +187,41 @@

+      {#
+       # groups.io
+       #}
+      <div class="panel panel-default">
+        <div class="panel-heading">
+          <h4 class="panel-title">
+            <a data-toggle="collapse" data-parent="#accordion" href="#collapseThreads">
+              Groups.io Email Threads
+            </a>
+          </h4>
+        </div>
+        <div id="collapseThreads" class="panel-collapse collapse">
+          <div class="panel-body">
+
+            <table class="table table-striped" id="emailthreads-master-list">
+            </table>
+
+          </div>
+        </div>
+      </div>
+
diff --git a/templates/search.html b/templates/search.html
index ac37719..76a4546 100755
--- a/templates/search.html
+++ b/templates/search.html
@@ -32,11 +32,13 @@

-      <div>
-        <a href="/">[clear all results]</a>
-      </div>
 
+      <div id="the-big-one">
+
+        <a href="/">[clear all results]</a>
+
+      </div>
 
 
@@ -106,8 +108,12 @@

               <span class="badge">{{totals["markdown"]}}</span>
               Github Markdown files
-              <br />
+              ,
+              <span class="badge">{{totals["emailthread"]}}</span>
+
+              Groups.io email threads
+              <br />
 
 
@@ -152,6 +158,12 @@


                     Repository: <br />
                     <a href="{{e.repo_url}}">{{e.repo_name}}</a>
+                {% elif e.kind=="emailthread" %}
+                    Groups.io Email Thread:
+                    <a href="{{e.url}}">{{e.title}}</a>
+                    <br />
+                    Started By: {{e.owner_name}}
+
                 {% else %}
                     Item: <a href="{{e.url}}">(link)</a>
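
Not part of the patch itself: a minimal sketch of how the new Groups.io pieces are wired together, based only on the code added above. The credential values below are placeholders; in the running application they come from the Flask app config (GROUPSIO_TOKEN, GROUPSIO_USERNAME, GROUPSIO_PASSWORD), as shown in UpdateIndexTask.__init__, and the centillion_search.py __main__ block currently raises instead of doing this.

# Hypothetical standalone wiring sketch (assumes config_centillion.json and a
# search_index/ directory exist, as in centillion_search.py's __main__ block).
from centillion_search import Search
from get_centillion_config import get_centillion_config

groupsio_credentials = {
    'groupsio_token'    : '<GROUPSIO_TOKEN>',      # placeholder values
    'groupsio_username' : '<GROUPSIO_USERNAME>',
    'groupsio_password' : '<GROUPSIO_PASSWORD>'
}

config = get_centillion_config('config_centillion.json')

# Crawl every subgroup's archives and index each thread as kind 'emailthread'
search = Search("search_index")
search.update_index_emailthreads(groupsio_credentials, config)

# Or drive the spider directly:
# from groupsio_util import GroupsIOArchivesCrawler
# spider = GroupsIOArchivesCrawler(groupsio_credentials, 'dcppc')
# spider.crawl_group_archives()
# archives = spider.get_archives()   # dict keyed by thread permalink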