Commit 65bfcb1: Google sheets integration (#22)

cullenwatson authored Aug 28, 2023
1 parent d10dce6 commit 65bfcb1
Showing 12 changed files with 185 additions and 35 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
@@ -63,11 +63,13 @@ jobs:
- name: Check error field in response
run: |
global_error=$(jq '.error' response.json)
indeed_error=$(jq '.indeed.error' response.json)
linkedin_error=$(jq '.linkedin.error' response.json)
if [[ "$indeed_error" != "null" || "$linkedin_error" != "null" ]]; then
echo "Error found in response:"
echo "Global Error: $global_error"
echo "Indeed Error: $indeed_error"
echo "LinkedIn Error: $linkedin_error"
exit 1
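The CI step above can be mirrored locally with a short Python helper (a sketch — the response shape, a top-level `error` plus per-site objects, follows the response schema in the README, and `response.json` is the file the workflow step reads):

```python
import json

def find_errors(response: dict) -> dict:
    """Collect non-null error fields from a JobSpy response dict."""
    errors = {}
    if response.get("error") is not None:
        errors["global"] = response["error"]
    for site in ("indeed", "linkedin", "zip_recruiter"):
        site_resp = response.get(site)
        if isinstance(site_resp, dict) and site_resp.get("error") is not None:
            errors[site] = site_resp["error"]
    return errors

if __name__ == "__main__":
    with open("response.json") as f:
        found = find_errors(json.load(f))
    if found:
        # non-zero exit mirrors the workflow's `exit 1`
        raise SystemExit(f"Error found in response: {found}")
```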
3 changes: 2 additions & 1 deletion .gitignore
@@ -4,4 +4,5 @@
/ven/
**/__pycache__/
*.pyc
.env
client_secret.json
40 changes: 35 additions & 5 deletions README.md
@@ -4,8 +4,10 @@

- Scrapes job postings from **LinkedIn**, **Indeed** & **ZipRecruiter** simultaneously
- Returns jobs as JSON or CSV with title, location, company, description & other data
- Imports directly into **Google Sheets**
- Optional JWT authorization

![jobspy_gsheet](https://github.com/cullenwatson/JobSpy/assets/78247585/9f0a997c-4e33-4167-b04e-31ab1f606edb)

### API

@@ -23,7 +25,7 @@ Optional
├── is_remote (bool)
├── results_wanted (int): per site_type
├── easy_apply (bool): only for linkedin
└── output_format (enum): json, csv, gsheet
```

### Request Example
@@ -34,6 +36,7 @@ Optional
"distance": 10,
"job_type": "fulltime",
"results_wanted": 15
"output_format": "gsheet"
```

### Response Schema
@@ -63,7 +66,16 @@ JobResponse
├── total_results (int)
└── returned_results (int)
```

### Response Example (Google Sheets)
```json
{
"status": "Successfully uploaded to Google Sheets",
"error": null,
"linkedin": null,
"indeed": null,
"zip_recruiter": null
}
```

### Response Example (JSON)
```json
{
@@ -132,15 +144,33 @@ _Python version >= [3.10](https://www.python.org/downloads/release/python-3100/)
4. Run the server with `uvicorn main:app --reload`

## Usage
### Google Sheets Integration (Optional)

#### Obtaining an Access Key: [Video Guide](https://www.youtube.com/watch?v=w533wJuilao)
* Enable the [Google Sheets & Google Drive API](https://console.cloud.google.com/)
* Create credentials -> service account -> create & continue
* Select role -> basic: editor -> done
* Click on the email you just created in the service account list
* Go to the Keys tab -> add key -> create new key -> JSON -> Create
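
Before wiring the downloaded key into the app, it can help to sanity-check that the JSON actually contains the service-account fields the Sheets client relies on (a hypothetical check; the field names come from the standard service-account key format, not this repo):

```python
import json

REQUIRED_FIELDS = {"type", "client_email", "private_key"}

def missing_key_fields(path: str = "client_secret.json") -> set:
    """Return any required service-account fields absent from the key file."""
    with open(path) as f:
        key = json.load(f)
    return REQUIRED_FIELDS - key.keys()
```

An empty set means the key file looks usable; `client_email` is also the address you share the sheet with in the next step.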

#### Using the key in the repo
* Copy the key file into the JobSpy repo as `/client_secret.json`
* Go to [my template sheet](https://docs.google.com/spreadsheets/d/1HAnn-aPv-BO4QTEzfIWc-5iw50duyMoTgX8o3RsEOWs/edit?usp=sharing) & save as a copy into your account
* Share the sheet with the email from the service account above with editor rights
* If you changed the name of the sheet, put the name in `GSHEET_NAME` in `/settings.py`

### How to call the API



#### [Postman](https://www.postman.com/downloads/) (preferred):
To use Postman:
1. Locate the files in the `/postman/` directory.
2. Import the Postman collection and environment JSON files.

#### Swagger UI:
Alternatively, call the API from the interactive documentation at [localhost:8000/docs](http://localhost:8000/docs).
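
If you'd rather skip Postman entirely, the endpoint can also be called from plain Python with only the standard library (a sketch — the `/api/v1/jobs/` path and the `search_term` field are assumptions based on the repo layout and request example, and the server must be running locally):

```python
import json
from urllib import request

def build_job_request(url: str = "http://localhost:8000/api/v1/jobs/") -> request.Request:
    """Build a POST request mirroring the Request Example in this README."""
    payload = {
        "site_type": ["indeed", "linkedin"],  # sites to scrape
        "search_term": "software engineer",   # assumed field name
        "results_wanted": 15,
        "output_format": "gsheet",            # upload results to Google Sheets
    }
    return request.Request(
        url,
        data=json.dumps(payload).encode(),
        headers={"Content-Type": "application/json"},
        method="POST",
    )

if __name__ == "__main__":
    with request.urlopen(build_job_request()) as resp:  # needs the server up
        print(json.load(resp))
```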

## FAQ

### I'm having issues with my queries. What should I do?
2 changes: 0 additions & 2 deletions api/auth/auth_utils.py
@@ -4,11 +4,9 @@
from fastapi import HTTPException, status, Depends
from fastapi.security import OAuth2PasswordBearer

from settings import *
from api.core.users import TokenData
from api.auth.db_utils import UserInDB, get_user

load_dotenv()
oauth2_scheme = OAuth2PasswordBearer(tokenUrl="/api/auth/token")


1 change: 1 addition & 0 deletions api/core/formatters/__init__.py
@@ -4,3 +4,4 @@
class OutputFormat(Enum):
CSV = "csv"
JSON = "json"
GSHEET = "gsheet"
43 changes: 37 additions & 6 deletions api/core/formatters/csv/__init__.py
@@ -1,19 +1,50 @@
import gspread
from oauth2client.service_account import ServiceAccountCredentials

import csv
from io import StringIO
from datetime import datetime

from ...jobs import *
from ...scrapers import *
from settings import *


class CSVFormatter:
@staticmethod
    def upload_to_google_sheet(csv_data: StringIO):
try:
scope = [
"https://www.googleapis.com/auth/spreadsheets",
"https://www.googleapis.com/auth/drive.file",
"https://www.googleapis.com/auth/drive",
]
credentials = ServiceAccountCredentials.from_json_keyfile_name(
GSHEET_JSON_KEY_PATH, scope
)
gc = gspread.authorize(credentials)
sh = gc.open(GSHEET_NAME)

worksheet = sh.get_worksheet(0)
data_string = csv_data.getvalue()
reader = csv.reader(StringIO(data_string))

rows = list(reader)

for i, row in enumerate(rows):
if i == 0:
continue
worksheet.append_row(row)
except Exception as e:
raise e

@staticmethod
def generate_filename() -> str:
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
return f"JobSpy_results_{timestamp}.csv"

    @staticmethod
    def format(jobs: CommonResponse) -> StringIO:
"""
        Transform the job objects into CSV
:param jobs:
@@ -41,7 +72,7 @@ def format(jobs: ScraperResponse) -> StringIO:
writer.writerow(headers)

for site, job_response in jobs.dict().items():
            if isinstance(job_response, dict) and job_response.get("success"):
for job in job_response["jobs"]:
writer.writerow(
[
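The upload path above re-parses the CSV buffer and drops the header row before appending to the worksheet; that parsing step can be exercised on its own with the standard library:

```python
import csv
from io import StringIO

def rows_without_header(csv_data: StringIO) -> list:
    """Parse a CSV buffer and drop the header row, as the uploader does."""
    reader = csv.reader(StringIO(csv_data.getvalue()))
    rows = list(reader)
    return rows[1:]
```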
5 changes: 3 additions & 2 deletions api/core/jobs/__init__.py
@@ -55,12 +55,13 @@ class JobResponse(BaseModel):
success: bool
error: str = None

jobs: list[JobPost] = []

total_results: int = None
returned_results: int = None

@validator("returned_results")
@validator("returned_results", pre=True, always=True)
def set_returned_results(cls, v, values):
if v is None and values.get("jobs"):
return len(values["jobs"])
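The `pre=True, always=True` validator's fallback — deriving `returned_results` from the jobs list when it was never set — boils down to a small pure function (sketched here without pydantic; the final `return v` is assumed, since the diff is truncated):

```python
def default_returned_results(v, values: dict):
    """Mirror of the validator: default to len(jobs) when unset."""
    if v is None and values.get("jobs"):
        return len(values["jobs"])
    return v
```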
12 changes: 7 additions & 5 deletions api/core/scrapers/__init__.py
@@ -1,6 +1,6 @@
from ..jobs import *
from ..formatters import OutputFormat
from typing import List, Dict, Optional
from typing import List, Dict, Optional, Any


class StatusException(Exception):
@@ -28,10 +28,12 @@ class ScraperInput(BaseModel):
results_wanted: int = 15


class CommonResponse(BaseModel):
status: Optional[str]
error: Optional[str]
linkedin: Optional[Any] = None
indeed: Optional[Any] = None
zip_recruiter: Optional[Any] = None


class Scraper:
24 changes: 18 additions & 6 deletions api/v1/jobs/__init__.py
@@ -6,13 +6,13 @@
from api.core.scrapers.indeed import IndeedScraper
from api.core.scrapers.ziprecruiter import ZipRecruiterScraper
from api.core.scrapers.linkedin import LinkedInScraper
from api.core.formatters.csv import CSVFormatter
from api.core.scrapers import (
ScraperInput,
Site,
JobResponse,
OutputFormat,
CommonResponse,
)
from typing import List, Dict, Tuple, Union

@@ -26,7 +26,7 @@


@router.post("/")
async def scrape_jobs(scraper_input: ScraperInput) -> CommonResponse:
"""
Asynchronously scrapes job data from multiple job sites.
:param scraper_input:
@@ -42,14 +42,26 @@ def scrape_site(site: Site) -> Tuple[str, JobResponse]:
with ThreadPoolExecutor() as executor:
results = dict(executor.map(scrape_site, scraper_input.site_type))

scraper_response = CommonResponse(status="JSON response success", **results)

if scraper_input.output_format == OutputFormat.CSV:
csv_output = CSVFormatter.format(scraper_response)
response = StreamingResponse(csv_output, media_type="text/csv")
response.headers[
"Content-Disposition"
] = f"attachment; filename={generate_filename()}"
] = f"attachment; filename={CSVFormatter.generate_filename()}"
return response

return scraper_response
elif scraper_input.output_format == OutputFormat.GSHEET:
csv_output = CSVFormatter.format(scraper_response)
try:
CSVFormatter.upload_to_google_sheet(csv_output)
return CommonResponse(status="Successfully uploaded to Google Sheets")

except Exception as e:
return CommonResponse(
status="Failed to upload to Google Sheet", error=str(e)
)

else:
return scraper_response
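
The fan-out in `scrape_jobs` — running `scrape_site` for each requested site on a thread pool and collecting the `(name, response)` pairs into a dict — can be sketched in isolation (the actual scraping is stubbed out here):

```python
from concurrent.futures import ThreadPoolExecutor

def scrape_site(site: str) -> tuple:
    """Stub standing in for a real scraper call; returns (site, response-like dict)."""
    return site, {"success": True, "jobs": []}

def scrape_all(sites: list) -> dict:
    # executor.map preserves input order; dict() keys each result by site name
    with ThreadPoolExecutor() as executor:
        return dict(executor.map(scrape_site, sites))
```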
67 changes: 61 additions & 6 deletions postman/JobSpy.postman_collection.json

Large diffs are not rendered by default.

12 changes: 12 additions & 0 deletions requirements.txt
@@ -1,39 +1,51 @@
anyio==3.7.1
atomicwrites==1.4.1
attrs==23.1.0
bcrypt==4.0.1
beautifulsoup4==4.12.2
cachetools==5.3.1
certifi==2023.5.7
cffi==1.15.1
chardet==4.0.0
charset-normalizer==3.2.0
click==8.1.4
colorama==0.4.6
cryptography==41.0.1
dataclasses==0.6
deprecation==2.1.0
ecdsa==0.18.0
exceptiongroup==1.1.2
fastapi==0.99.1
google-auth==2.22.0
google-auth-oauthlib==1.0.0
gotrue==0.2.0
gspread==5.10.0
h11==0.14.0
httpcore==0.12.3
httplib2==0.22.0
httpx==0.16.1
idna==2.10
iniconfig==2.0.0
oauth2client==4.1.3
oauthlib==3.2.2
packaging==23.1
passlib==1.7.4
pluggy==1.2.0
postgrest-py==0.4.0
py==1.11.0
pyasn1==0.5.0
pyasn1-modules==0.3.0
pycparser==2.21
pydantic==1.10.11
pyparsing==3.1.1
pytest==6.2.5
python-dateutil==2.8.2
python-dotenv==1.0.0
python-jose==3.3.0
python-multipart==0.0.6
realtime-py==0.1.3
requests==2.25.1
requests-oauthlib==1.3.1
rfc3986==1.5.0
rsa==4.9
six==1.16.0
9 changes: 7 additions & 2 deletions settings.py
@@ -2,9 +2,14 @@
import os

load_dotenv()
# gsheets (template to copy at https://docs.google.com/spreadsheets/d/1HAnn-aPv-BO4QTEzfIWc-5iw50duyMoTgX8o3RsEOWs/edit?usp=sharing)
GSHEET_JSON_KEY_PATH = "client_secret.json"
GSHEET_NAME = "JobSpy"

# optional auth
AUTH_REQUIRED = False
SUPABASE_URL = os.environ.get("SUPABASE_URL")
SUPABASE_KEY = os.environ.get("SUPABASE_KEY")
JWT_SECRET_KEY = os.environ.get("JWT_SECRET_KEY")
ALGORITHM = "HS256"
ACCESS_TOKEN_EXPIRE_MINUTES = 60
