Chore: Add anonymous user ID in tracking events (#124)

airbytehq · Mar 12, 2024 · 2f483ec · 2f483ec
1 parent 1e45e42
commit 2f483ec
Show file tree

Hide file tree

Showing 11 changed files with 195 additions and 5 deletions.
diff --git a/.github/workflows/autofix.yml b/.github/workflows/autofix.yml
@@ -5,6 +5,9 @@ on:
   repository_dispatch:
     types: [autofix-command]
 
+env:
+  AIRBYTE_ANALYTICS_ID: ${{ vars.AIRBYTE_ANALYTICS_ID }}
+
 jobs:
   python-autofix:
     runs-on: ubuntu-latest

diff --git a/.github/workflows/pydoc_preview.yml b/.github/workflows/pydoc_preview.yml
@@ -6,6 +6,8 @@ on:
     - main
   pull_request: {}
 
+env:
+  AIRBYTE_ANALYTICS_ID: ${{ vars.AIRBYTE_ANALYTICS_ID }}
 
 jobs:
   preview_docs:

diff --git a/.github/workflows/pydoc_publish.yml b/.github/workflows/pydoc_publish.yml
@@ -8,6 +8,9 @@ on:
   # Allows you to run this workflow manually from the Actions tab
   workflow_dispatch:
 
+env:
+  AIRBYTE_ANALYTICS_ID: ${{ vars.AIRBYTE_ANALYTICS_ID }}
+
 # Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages
 permissions:
   contents: read

diff --git a/.github/workflows/pypi_publish.yml b/.github/workflows/pypi_publish.yml
@@ -5,6 +5,9 @@ on:
 
   workflow_dispatch:
 
+env:
+  AIRBYTE_ANALYTICS_ID: ${{ vars.AIRBYTE_ANALYTICS_ID }}
+
 jobs:
   build:
     runs-on: ubuntu-latest

diff --git a/.github/workflows/python_lint.yml b/.github/workflows/python_lint.yml
@@ -6,6 +6,9 @@ on:
       - main
     pull_request: {}
 
+env:
+  AIRBYTE_ANALYTICS_ID: ${{ vars.AIRBYTE_ANALYTICS_ID }}
+
 jobs:
   ruff-lint-check:
     name: Ruff Lint Check

diff --git a/.github/workflows/python_pytest.yml b/.github/workflows/python_pytest.yml
@@ -13,6 +13,9 @@ on:
       - main
     pull_request: {}
 
+env:
+  AIRBYTE_ANALYTICS_ID: ${{ vars.AIRBYTE_ANALYTICS_ID }}
+
 jobs:
   pytest-fast:
     name: Pytest (Fast)

diff --git a/.github/workflows/release_drafter.yml b/.github/workflows/release_drafter.yml
@@ -5,6 +5,9 @@ on:
     branches:
       - main
 
+env:
+  AIRBYTE_ANALYTICS_ID: ${{ vars.AIRBYTE_ANALYTICS_ID }}
+
 permissions:
   contents: read
 

diff --git a/.github/workflows/semantic_pr_check.yml b/.github/workflows/semantic_pr_check.yml
@@ -7,6 +7,9 @@ on:
       - edited
       - synchronize
 
+env:
+  AIRBYTE_ANALYTICS_ID: ${{ vars.AIRBYTE_ANALYTICS_ID }}
+
 permissions:
   pull-requests: read
 

diff --git a/.github/workflows/slash_command_dispatch.yml b/.github/workflows/slash_command_dispatch.yml
@@ -4,6 +4,9 @@ on:
   issue_comment:
     types: [created]
 
+env:
+  AIRBYTE_ANALYTICS_ID: ${{ vars.AIRBYTE_ANALYTICS_ID }}
+
 jobs:
   slashCommandDispatch:
     runs-on: ubuntu-latest

diff --git a/airbyte/_util/telemetry.py b/airbyte/_util/telemetry.py
@@ -37,10 +37,12 @@
 from dataclasses import asdict, dataclass
 from enum import Enum
 from functools import lru_cache
-from typing import TYPE_CHECKING, Any
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, cast
 
 import requests
 import ulid
+import yaml
 
 from airbyte import exceptions as exc
 from airbyte._util import meta
@@ -52,6 +54,10 @@
     from airbyte.sources.base import Source
 
 
+DEBUG = True
+"""Enable debug mode for telemetry code."""
+
+
 HASH_SEED = "PyAirbyte:"
 """Additional seed for randomizing one-way hashed strings."""
 
@@ -73,6 +79,92 @@
 DO_NOT_TRACK = "DO_NOT_TRACK"
 """Environment variable to opt-out of telemetry."""
 
+_ENV_ANALYTICS_ID = "AIRBYTE_ANALYTICS_ID"  # Allows user to override the anonymous user ID
+# YAML file in the user's home directory where the anonymous user ID is persisted.
+_ANALYTICS_FILE = Path.home() / ".airbyte" / "analytics.yml"
+# Module-level cache of the resolved ID. None means "not resolved"; it is assigned once at
+# import time from `_get_analytics_id()`, which yields None when the user has opted out.
+_ANALYTICS_ID: str | bool | None = None
+
+
+def _setup_analytics() -> str | bool:
+    """Set up the analytics file if it doesn't exist.
+
+    The anonymous user ID is resolved in this order:
+
+    1. If the `DO_NOT_TRACK` environment variable is set to a non-empty value, tracking is
+       disabled and nothing is read or written.
+    2. A non-empty `AIRBYTE_ANALYTICS_ID` environment variable overrides any stored ID.
+    3. Otherwise the ID stored in the analytics file is reused.
+    4. If no usable ID is found, a new ULID is generated.
+
+    Whenever the resolved ID is not already stored, the analytics file is (re)written on a
+    best-effort basis. Read, parse and write failures are never raised; they are collected
+    and printed only when `DEBUG` is enabled.
+
+    Returns:
+        The anonymous user ID, or False if the user has opted out.
+    """
+    issues: list[str] = []
+
+    if os.environ.get(DO_NOT_TRACK):
+        # User has opted out of tracking.
+        return False
+
+    # If the user has chosen to override their analytics ID, use that value and remember it
+    # for future invocations. An unset or empty variable counts as "no override".
+    anonymous_user_id: str | None = os.environ.get(_ENV_ANALYTICS_ID) or None
+
+    if not _ANALYTICS_FILE.exists():
+        # This is a one-time message to inform the user that we are tracking anonymous usage stats.
+        print(
+            "Anonymous usage reporting is enabled. For more information or to opt out, please"
+            " see https://docs.airbyte.io/pyairbyte/anonymized-usage-statistics"
+        )
+    else:
+        # Pre-bind so a failed read/parse leaves a defined value instead of an unbound name.
+        analytics: Any = None
+        try:
+            analytics = yaml.safe_load(_ANALYTICS_FILE.read_text())
+        except Exception as ex:
+            # Deliberately broad: telemetry setup must never break the host program.
+            issues.append(f"File appears corrupted. Error was: {ex!s}")
+
+        # Valid YAML is not necessarily a mapping (a bare string or a list parses fine), so
+        # check the type before looking anything up.
+        stored_id = analytics.get("anonymous_user_id") if isinstance(analytics, dict) else None
+        if stored_id:
+            # The analytics ID was successfully located. Unquoted all-digit IDs are parsed
+            # as integers, so normalize to str before comparing or returning.
+            stored_id = str(stored_id)
+            if not anonymous_user_id or anonymous_user_id == stored_id:
+                # No override, or the override matches: no need to update the file.
+                return stored_id
+
+            issues.append("Provided analytics ID did not match the file. Rewriting the file.")
+            print(
+                f"Received a user-provided analytics ID override in the '{_ENV_ANALYTICS_ID}' "
+                "environment variable."
+            )
+
+    # File is missing, incomplete, or stale. Create a new one.
+    anonymous_user_id = anonymous_user_id or str(ulid.ULID())
+    # safe_dump quotes values that would otherwise be read back as a non-string type.
+    id_line = yaml.safe_dump({"anonymous_user_id": anonymous_user_id}, default_flow_style=False)
+    try:
+        _ANALYTICS_FILE.parent.mkdir(exist_ok=True, parents=True)
+        _ANALYTICS_FILE.write_text(
+            "# This file is used by PyAirbyte to track anonymous usage statistics.\n"
+            "# For more information or to opt out, please see\n"
+            "# - https://docs.airbyte.com/operator-guides/telemetry\n"
+            f"{id_line}"
+        )
+    except Exception:
+        # Failed to create the analytics file. Likely due to a read-only filesystem.
+        issues.append("Failed to write the analytics file. Check filesystem permissions.")
+
+    if DEBUG and issues:
+        nl = "\n"
+        print(f"One or more issues occurred when configuring usage tracking:\n{nl.join(issues)}")
+
+    return anonymous_user_id
+
+
+def _get_analytics_id() -> str | None:
+    """Return the anonymous user ID, or None when tracking is opted out.
+
+    The module-level `_ANALYTICS_ID` is used when it holds a value; otherwise
+    `_setup_analytics()` is invoked to resolve one. This function only reads the
+    module-level value, it does not assign it.
+    """
+    cached: str | bool | None = _ANALYTICS_ID
+    resolved = _setup_analytics() if cached is None else cached
+    # `False` is the opt-out marker produced by `_setup_analytics()`.
+    return None if resolved is False else cast(str, resolved)
+
+
+# Resolve the ID once at import time so later `_get_analytics_id()` calls can reuse it.
+# NOTE: importing this module may therefore print the usage notice and create the analytics file.
+_ANALYTICS_ID = _get_analytics_id()
+
 
 class SyncState(str, Enum):
     STARTED = "started"
@@ -174,7 +266,7 @@ def send_telemetry(
             "https://api.segment.io/v1/track",
             auth=(PYAIRBYTE_APP_TRACKING_KEY, ""),
             json={
-                "anonymousId": "airbyte-lib-user",
+                "anonymousId": _get_analytics_id(),
                 "event": "sync",
                 "properties": payload_props,
                 "timestamp": datetime.datetime.utcnow().isoformat(),  # noqa: DTZ003

diff --git a/tests/unit_tests/test_anonymous_usage_stats.py b/tests/unit_tests/test_anonymous_usage_stats.py
@@ -4,8 +4,10 @@
 import itertools
 from contextlib import nullcontext as does_not_raise
 import json
+import os
+from pathlib import Path
 import re
-from unittest.mock import Mock, call, patch
+from unittest.mock import MagicMock, call, patch
 from freezegun import freeze_time
 
 import responses
@@ -16,8 +18,6 @@
 from airbyte.version import get_version
 import airbyte as ab
 from airbyte._util import telemetry
-import requests
-import datetime
 
 
 @responses.activate
@@ -174,3 +174,75 @@ def test_tracking(
             }
         )
     ])
+
+
+def test_setup_analytics_existing_file(monkeypatch):
+    """An ID already present in the analytics file is returned when no override is set."""
+    # Clear both variables so neither the opt-out nor the override path is taken.
+    for env_var in (telemetry.DO_NOT_TRACK, telemetry._ENV_ANALYTICS_ID):
+        monkeypatch.delenv(env_var, raising=False)
+
+    # Pretend every path exists and holds a well-formed analytics document.
+    monkeypatch.setattr(Path, 'read_text', lambda _self: "anonymous_user_id: test_id\n")
+    monkeypatch.setattr(Path, 'exists', lambda _self: True)
+
+    resolved_id = telemetry._setup_analytics()
+    assert resolved_id == 'test_id'
+
+
+def test_setup_analytics_missing_file(monkeypatch):
+    """With no analytics file, the env-provided ID is returned and written exactly once."""
+    monkeypatch.delenv(telemetry.DO_NOT_TRACK, raising=False)
+    monkeypatch.setenv(telemetry._ENV_ANALYTICS_ID, 'test_id')
+
+    # Report every path as absent and intercept the write so nothing touches the disk.
+    write_text_spy = MagicMock()
+    monkeypatch.setattr(Path, 'exists', lambda _self: False)
+    monkeypatch.setattr(Path, 'write_text', write_text_spy)
+
+    resolved_id = telemetry._setup_analytics()
+
+    assert resolved_id == 'test_id'
+    assert write_text_spy.call_count == 1
+
+
+def test_setup_analytics_read_only_filesystem(monkeypatch, capfd):
+    """A failing write must not raise, and the raw error text must not reach stdout."""
+    monkeypatch.delenv(telemetry.DO_NOT_TRACK, raising=False)
+    monkeypatch.setenv(telemetry._ENV_ANALYTICS_ID, 'test_id')
+    monkeypatch.setattr(Path, 'exists', lambda _self: False)
+
+    # Every write attempt fails the way a read-only filesystem would.
+    failing_write = MagicMock(side_effect=PermissionError("Read-only filesystem"))
+    monkeypatch.setattr(Path, 'write_text', failing_write)
+
+    # The ID is still returned; the exception is swallowed.
+    resolved_id = telemetry._setup_analytics()
+    assert resolved_id == "test_id"
+    assert failing_write.call_count == 1
+
+    # The underlying exception message must not be echoed to the user.
+    stdout_text = capfd.readouterr().out
+    assert "Read-only filesystem" not in stdout_text
+
+
+def test_setup_analytics_corrupt_file(monkeypatch):
+    """An existing file with no usable ID yields a truthy ID and exactly one rewrite."""
+    for env_var in (telemetry.DO_NOT_TRACK, telemetry._ENV_ANALYTICS_ID):
+        monkeypatch.delenv(env_var, raising=False)
+
+    # The file "exists" but its content does not carry an `anonymous_user_id` entry.
+    monkeypatch.setattr(Path, 'read_text', lambda _self: "not-a-valid ::: yaml file\n")
+    monkeypatch.setattr(Path, 'exists', lambda _self: True)
+
+    write_text_spy = MagicMock()
+    monkeypatch.setattr(Path, 'write_text', write_text_spy)
+
+    resolved_id = telemetry._setup_analytics()
+
+    assert resolved_id
+    assert write_text_spy.call_count == 1
+
+
+def test_get_analytics_id(monkeypatch):
+    """A value cached in the module-level `_ANALYTICS_ID` is returned unchanged."""
+    for env_var in (telemetry.DO_NOT_TRACK, telemetry._ENV_ANALYTICS_ID):
+        monkeypatch.delenv(env_var, raising=False)
+
+    # Intercept file writes so the test cannot touch the real home directory.
+    write_text_spy = MagicMock()
+    monkeypatch.setattr(Path, 'write_text', write_text_spy)
+
+    # Seed the module-level cache with a known ID.
+    monkeypatch.setattr(telemetry, '_ANALYTICS_ID', 'test_id')
+
+    resolved_id = telemetry._get_analytics_id()
+    assert resolved_id == 'test_id'