Commit

Merge pull request #22 from dcppc/groupsio-emails

adding calls to index groupsio emails

charlesreid1 authored Aug 14, 2018
2 parents 1efa5bc + b6c89a2 commit 9d883b0
Showing 8 changed files with 628 additions and 28 deletions.
22 changes: 13 additions & 9 deletions centillion.py
@@ -27,10 +27,16 @@


class UpdateIndexTask(object):
def __init__(self, gh_access_token, diff_index=False):
def __init__(self, app_config, diff_index=False):
self.diff_index = diff_index
thread = threading.Thread(target=self.run, args=())
self.gh_access_token = gh_access_token

self.gh_token = app_config['GITHUB_TOKEN']
self.groupsio_credentials = {
'groupsio_token' : app_config['GROUPSIO_TOKEN'],
'groupsio_username' : app_config['GROUPSIO_USERNAME'],
'groupsio_password' : app_config['GROUPSIO_PASSWORD']
}
thread.daemon = True
thread.start()

@@ -43,9 +49,10 @@ def run(self):
from get_centillion_config import get_centillion_config
config = get_centillion_config('config_centillion.json')

search.update_index_ghfiles(self.gh_access_token,config)
search.update_index_issues(self.gh_access_token,config)
search.update_index_gdocs(config)
search.update_index_emailthreads(self.groupsio_credentials,config)
###search.update_index_ghfiles(self.gh_token,config)
###search.update_index_issues(self.gh_token,config)
###search.update_index_gdocs(config)



@@ -170,12 +177,9 @@ def update_index():
mresp = github.get('/teams/%s/members/%s'%(copper_team_id,username))
if mresp.status_code==204:

#gh_oauth_token = github.token['access_token']
gh_access_token = app.config['GITHUB_TOKEN']

# --------------------
# Business as usual
UpdateIndexTask(gh_access_token,
UpdateIndexTask(app.config,
diff_index=False)
flash("Rebuilding index, check console output")
return render_template("controlpanel.html",
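For reference, a minimal sketch of the config mapping that UpdateIndexTask(app.config, ...) now expects. The four key names are taken from the __init__ changes above; the placeholder values, the import, and the standalone call are illustrative assumptions, not part of this commit.

# Illustrative only: the keys read by UpdateIndexTask.__init__,
# assuming centillion.py is importable as a module.
from centillion import UpdateIndexTask

app_config = {
    'GITHUB_TOKEN'      : 'ghp-xxxxxxxx',        # GitHub API token
    'GROUPSIO_TOKEN'    : 'xxxxxxxx',            # Groups.io API token
    'GROUPSIO_USERNAME' : 'user@example.com',    # Groups.io login
    'GROUPSIO_PASSWORD' : 'xxxxxxxx',            # Groups.io password
}
task = UpdateIndexTask(app_config, diff_index=False)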
151 changes: 137 additions & 14 deletions centillion_search.py
@@ -5,6 +5,7 @@
import base64

from gdrive_util import GDrive
from groupsio_util import GroupsIOArchivesCrawler
from apiclient.http import MediaIoBaseDownload

import mistune
@@ -128,7 +129,6 @@ def open_index(self, index_folder, create_new=False):
schema = Schema(
id = ID(stored=True, unique=True),
kind = ID(stored=True),
#fingerprint = ID(stored=True),

created_time = ID(stored=True),
modified_time = ID(stored=True),
@@ -316,7 +316,7 @@ def add_drive_file(self, writer, item, temp_dir, config, update=False):
# to a search index.


def add_issue(self, writer, issue, gh_access_token, config, update=True):
def add_issue(self, writer, issue, gh_token, config, update=True):
"""
Add a Github issue/comment to a search index.
"""
@@ -368,8 +368,11 @@ def add_issue(self, writer, issue, gh_access_token, config, update=True):



# ------------------------------
# Add a single github file
# to a search index.

def add_ghfile(self, writer, d, gh_access_token, config, update=True):
def add_ghfile(self, writer, d, gh_token, config, update=True):
"""
Use a Github file API record to add a filename
to the search index.
@@ -401,7 +404,7 @@ def add_ghfile(self, writer, d, gh_access_token, config, update=True):
# don't forget the headers for private repos!
# useful: https://bit.ly/2LSAflS

headers = {'Authorization' : 'token %s'%(gh_access_token)}
headers = {'Authorization' : 'token %s'%(gh_token)}

response = requests.get(furl, headers=headers)
if response.status_code==200:
@@ -466,6 +469,41 @@ def add_ghfile(self, writer, d, gh_access_token, config, update=True):




# ------------------------------
# Add a single email thread
# to a search index.

def add_emailthread(self, writer, d, config, update=True):
"""
Use a Groups.io email thread record to add
an email thread to the search index.
"""
indexed_time = clean_timestamp(datetime.now())

# Now create the actual search index record
writer.add_document(
id = d['permalink'],
kind = 'emailthread',
created_time = '',
modified_time = '',
indexed_time = indexed_time,
title = d['subject'],
url = d['permalink'],
mimetype='',
owner_email='',
owner_name=d['original_sender'],
repo_name = '',
repo_url = '',
github_user = '',
issue_title = '',
issue_url = '',
content = d['content']
)
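
# A usage sketch, not part of this commit: pulling an emailthread
# record back out of the Whoosh index that add_emailthread() populates.
# The field names match the writer.add_document() call above, the
# "search_index" directory matches the one used in __main__ below,
# and the query string is arbitrary.

import whoosh.index
from whoosh.qparser import QueryParser

ix = whoosh.index.open_dir("search_index")
with ix.searcher() as s:
    q = QueryParser("content", schema=ix.schema).parse("data commons")
    for hit in s.search(q, limit=10):
        if hit['kind'] == 'emailthread':
            print(hit['title'], hit['url'])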




# ------------------------------
# Define how to update search index
# using different kinds of collections
@@ -590,7 +628,7 @@ def update_index_gdocs(self,
# ------------------------------
# Github Issues/Comments

def update_index_issues(self, gh_access_token, config):
def update_index_issues(self, gh_token, config):
"""
Update the search index using a collection of
Github repo issues and comments.
@@ -615,7 +653,7 @@ def update_index_issues(self, gh_access_token, config):
# Get the set of remote ids:
# ------
# Start with api object
g = Github(gh_access_token)
g = Github(gh_token)

# Now index all issue threads in the user-specified repos

@@ -669,7 +707,7 @@ def update_index_issues(self, gh_access_token, config):
# cop out
writer.delete_by_term('id',update_issue)
item = full_items[update_issue]
self.add_issue(writer, item, gh_access_token, config, update=True)
self.add_issue(writer, item, gh_token, config, update=True)
count += 1


@@ -678,7 +716,7 @@ def update_index_issues(self, gh_access_token, config):
add_issues = remote_issues - indexed_issues
for add_issue in add_issues:
item = full_items[add_issue]
self.add_issue(writer, item, gh_access_token, config, update=False)
self.add_issue(writer, item, gh_token, config, update=False)
count += 1


@@ -688,9 +726,9 @@ def update_index_issues(self, gh_access_token, config):


# ------------------------------
# Github Markdown Files
# Github Files

def update_index_ghfiles(self, gh_access_token, config):
def update_index_ghfiles(self, gh_token, config):
"""
Update the search index using a collection of
files (and, separately, Markdown files) from
@@ -721,7 +759,7 @@ def update_index_ghfiles(self, gh_access_token, config):
# Get the set of remote ids:
# ------
# Start with api object
g = Github(gh_access_token)
g = Github(gh_token)

# Now index all the files.

@@ -795,7 +833,7 @@ def update_index_ghfiles(self, gh_access_token, config):
# cop out: just delete and re-add
writer.delete_by_term('id',update_id)
item = full_items[update_id]
self.add_ghfile(writer, item, gh_access_token, config, update=True)
self.add_ghfile(writer, item, gh_token, config, update=True)
count += 1


@@ -804,7 +842,7 @@ def update_index_ghfiles(self, gh_access_token, config):
add_ids = remote_ids - indexed_ids
for add_id in add_ids:
item = full_items[add_id]
self.add_ghfile(writer, item, gh_access_token, config, update=False)
self.add_ghfile(writer, item, gh_token, config, update=False)
count += 1


@@ -817,10 +855,89 @@ def update_index_ghfiles(self, gh_access_token, config):
# Groups.io Emails


def update_index_emailthreads(self, groupsio_credentials, config):
"""
Update the search index using the email archives
of groups.io groups. This method looks deceptively
simple; all the logic is hidden in the spider
(groupsio_util.py).
RELEASE THE SPIDER!!!
"""
# Algorithm:
# - get set of indexed ids
# - get set of remote ids
# - drop indexed ids not in remote ids
# - index all remote ids

# Get the set of indexed ids:
# ------
indexed_ids = set()
p = QueryParser("kind", schema=self.ix.schema)
q = p.parse("emailthread")
with self.ix.searcher() as s:
results = s.search(q,limit=None)
for result in results:
indexed_ids.add(result['id'])

# Get the set of remote ids:
# ------
spider = GroupsIOArchivesCrawler(groupsio_credentials, 'dcppc')

# ask spider to crawl the archives
spider.crawl_group_archives()

# now spider.get_archives() returns a dictionary
# keyed by thread permalink (URL); each value
# is a dictionary representing one thread:
#
# thread = {
#     'permalink' : permalink,
#     'subject' : subject,
#     'original_sender' : original_sender,
#     'content' : full_content
# }
#
# It is hard to reliably extract more
# information than that from an email thread.
archives = spider.get_archives()

# Collect the set of remote thread ids
remote_ids = set()
for k in archives.keys():
remote_ids.add(k)

writer = self.ix.writer()
count = 0

# Drop any id in indexed_ids
# not in remote_ids
drop_ids = indexed_ids - remote_ids
for drop_id in drop_ids:
writer.delete_by_term('id',drop_id)

# Update any id in indexed_ids
# and in remote_ids
update_ids = indexed_ids & remote_ids
for update_id in update_ids:
# cop out: just delete and re-add
writer.delete_by_term('id',update_id)
item = archives[update_id]
self.add_emailthread(writer, item, config, update=True)
count += 1

# Add any id in remote_ids
# but not in indexed_ids
add_ids = remote_ids - indexed_ids
for add_id in add_ids:
item = archives[add_id]
self.add_emailthread(writer, item, config, update=False)
count += 1

writer.commit()
print("Done, updated %d Groups.io email threads in the index" % count)
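
# Not part of this commit: a minimal skeleton of the interface that
# update_index_emailthreads() assumes groupsio_util.py provides. Only
# the two methods called above are shown; the constructor arguments
# are inferred from the call site, and the crawling internals are a
# guess.

class GroupsIOArchivesCrawler(object):
    def __init__(self, credentials, group_name):
        self.credentials = credentials   # dict: token, username, password
        self.group_name = group_name     # e.g. 'dcppc'
        self.archives = {}               # permalink -> thread dictionary

    def crawl_group_archives(self):
        # Walk the group's web archive and fill self.archives
        # with one record per email thread.
        raise NotImplementedError

    def get_archives(self):
        # Dictionary keyed by thread permalink (URL); each value
        # holds 'permalink', 'subject', 'original_sender', 'content'.
        return self.archives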


# ---------------------------------
@@ -935,6 +1052,7 @@ def get_document_total_count(self):
"issue" : None,
"ghfile" : None,
"markdown" : None,
"emailthread" : None,
"total" : None
}
for key in counts.keys():
@@ -962,6 +1080,8 @@ def get_list(self,doctype):
item_keys=''
if doctype=='gdoc':
item_keys = ['title','owner_name','url','mimetype']
elif doctype=='emailthread':
item_keys = ['title','owner_name','url']
elif doctype=='issue':
item_keys = ['title','repo_name','repo_url','url']
elif doctype=='ghfile':
@@ -987,6 +1107,9 @@ def get_list(self,doctype):


if __name__ == "__main__":

raise Exception("Error: main method not implemented (fix groupsio credentials first)")

search = Search("search_index")

from get_centillion_config import get_centillion_config