Improved scene matching with parallelism + more

MinasukiHikimuna · Aug 28, 2024 · ecf12f3 · ecf12f3
1 parent 4b9e373
commit ecf12f3
Showing 1 changed file with 82 additions and 40 deletions.
diff --git a/pandas/scene_matching.ipynb b/pandas/scene_matching.ipynb
@@ -53,6 +53,20 @@
     "    return True  # If stash_ids is not a list, assume endpoint is missing\n"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "missing_from_stashbox_tag = stash.find_tags({\n",
+    "    \"name\": {\n",
+    "        \"modifier\": \"EQUALS\",\n",
+    "        \"value\": \"Missing From Stashbox\"\n",
+    "    }\n",
+    "})"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -65,6 +79,12 @@
     "        \"stash_id_endpoint\": {\n",
     "            \"endpoint\": \"https://stashdb.org/graphql\",\n",
     "            \"modifier\": \"NOT_NULL\"\n",
+    "        },\n",
+    "        \"tags\": {\n",
+    "            \"depth\": -1,\n",
+    "            \"excludes\": [missing_from_stashbox_tag[0][\"id\"]],\n",
+    "            \"value\": [],\n",
+    "            \"modifier\": \"INCLUDES_ALL\"\n",
     "        }\n",
     "    },\n",
     "    fragment=\"\"\"\n",
@@ -113,12 +133,27 @@
    "outputs": [],
    "source": [
     "# Run this cell to find scenes of all studios under a single parent studio (the single-studio lookup is kept below, commented out)\n",
-    "studio_name = \"Meana Wolf\"\n",
-    "studios = stash.find_studios({ \n",
+    "# studio_name = \"Whipped Ass\"\n",
+    "# studios = stash.find_studios({ \n",
+    "#     \"name\": {\n",
+    "#         \"modifier\": \"EQUALS\",\n",
+    "#         \"value\": studio_name\n",
+    "#     }\n",
+    "# })\n",
+    "\n",
+    "parent_studio_name = \"Brazzers\"\n",
+    "parent_studios = stash.find_studios({\n",
     "    \"name\": {\n",
     "        \"modifier\": \"EQUALS\",\n",
-    "        \"value\": studio_name\n",
-    "    }   \n",
+    "        \"value\": parent_studio_name\n",
+    "    }\n",
+    "})\n",
+    "\n",
+    "studios = stash.find_studios({\n",
+    "    \"parents\": {\n",
+    "        \"value\": [parent_studios[0][\"id\"]],\n",
+    "        \"modifier\": \"INCLUDES\"\n",
+    "    }\n",
     "})\n",
     "\n",
     "studio_ids = [studio[\"id\"] for studio in studios]\n",
@@ -222,44 +257,52 @@
     "\n",
     "import pandas as pd\n",
     "import requests\n",
+    "from concurrent.futures import ThreadPoolExecutor, as_completed\n",
+    "\n",
+    "# Function to fetch scene data for a single row\n",
+    "def fetch_single_scene_data(row, headers):\n",
+    "    id = row[\"id\"]\n",
+    "    phash = row[\"phash\"]\n",
+    "\n",
+    "    response = requests.get(\n",
+    "        f\"https://api.theporndb.net/scenes?hash={phash}&hashType=PHASH\",\n",
+    "        headers=headers,\n",
+    "    )\n",
+    "\n",
+    "    if response.status_code == 200:\n",
+    "        response_json = response.json()\n",
+    "\n",
+    "        if response_json:\n",
+    "            scenes_data = response_json.get(\"data\", [])\n",
+    "            if scenes_data:\n",
+    "                scene_data_list = []\n",
+    "                for scene_data in scenes_data:\n",
+    "                    scene_data_list.append({\n",
+    "                        \"tpdb_date\": scene_data.get(\"date\"),\n",
+    "                        \"tpdb_title\": scene_data.get(\"title\"),\n",
+    "                        \"tpdb_uuid\": scene_data.get(\"id\"),\n",
+    "                        \"tpdb_url\": scene_data.get(\"url\"),\n",
+    "                        \"tpdb_studio_id\": scene_data.get(\"site\", {}).get(\"uuid\"),\n",
+    "                        \"tpdb_studio_name\": scene_data.get(\"site\", {}).get(\"name\"),\n",
+    "                        \"tpdb_studio_url\": scene_data.get(\"site\", {}).get(\"url\"),\n",
+    "                    })\n",
+    "                return {\n",
+    "                    \"id\": id,\n",
+    "                    \"stash_scene\": row.to_dict(),  # Store the whole row as a dict\n",
+    "                    \"matches\": scene_data_list\n",
+    "                }\n",
+    "    return None\n",
     "\n",
     "# Function to fetch scene data by phash for every row in parallel and return the matches as a DataFrame\n",
     "def fetch_scene_data(df, headers):\n",
-    "    # List to store the scene data for DataFrame creation\n",
     "    scenes_data_list = []\n",
     "\n",
-    "    # Loop through data frame rows\n",
-    "    for index, row in df.iterrows():\n",
-    "        id = row[\"id\"]\n",
-    "        phash = row[\"phash\"]\n",
-    "\n",
-    "        response = requests.get(\n",
-    "            f\"https://api.theporndb.net/scenes?hash={phash}&hashType=PHASH\",\n",
-    "            headers=headers,\n",
-    "        )\n",
-    "\n",
-    "        if response.status_code == 200:\n",
-    "            response_json = response.json()\n",
-    "\n",
-    "            if response_json:\n",
-    "                scenes_data = response_json.get(\"data\", [])\n",
-    "                if scenes_data:\n",
-    "                    scene_data_list = []\n",
-    "                    for scene_data in scenes_data:\n",
-    "                        scene_data_list.append({\n",
-    "                            \"tpdb_date\": scene_data.get(\"date\"),\n",
-    "                            \"tpdb_title\": scene_data.get(\"title\"),\n",
-    "                            \"tpdb_uuid\": scene_data.get(\"id\"),\n",
-    "                            \"tpdb_url\": scene_data.get(\"url\"),\n",
-    "                            \"tpdb_studio_id\": scene_data.get(\"site\", {}).get(\"uuid\"),\n",
-    "                            \"tpdb_studio_name\": scene_data.get(\"site\", {}).get(\"name\"),\n",
-    "                            \"tpdb_studio_url\": scene_data.get(\"site\", {}).get(\"url\"),\n",
-    "                        })\n",
-    "                    scenes_data_list.append({\n",
-    "                        \"id\": id,\n",
-    "                        \"stash_scene\": row.to_dict(),  # Store the whole row as a dict\n",
-    "                        \"matches\": scene_data_list\n",
-    "                    })\n",
+    "    with ThreadPoolExecutor(max_workers=10) as executor:\n",
+    "        future_to_row = {executor.submit(fetch_single_scene_data, row, headers): row for _, row in df.iterrows()}\n",
+    "        for future in as_completed(future_to_row):\n",
+    "            result = future.result()\n",
+    "            if result:\n",
+    "                scenes_data_list.append(result)\n",
     "\n",
     "    # Create a DataFrame from the list of scene data\n",
     "    return pd.DataFrame(scenes_data_list)\n",
@@ -498,8 +541,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Run this cell to save TPDB UUID for those where all identifiers match\n",
-    "save_tpdb_uuid(df_matched_scenes[df_matched_scenes[\"all_match\"]])"
+    "save_tpdb_uuid(df_matched_scenes[df_matched_scenes[\"date_match\"] & df_matched_scenes[\"exact_title_match\"]])\n"
    ]
   },
   {
@@ -510,7 +552,7 @@
    "source": [
     "# Example: Run this cell to save TPDB UUID for those where the date, studio name and URL all match and the title is a near match.\n",
     "# Tweak the conditions as needed.\n",
-    "df_partially_matched = df_matched_scenes[df_matched_scenes[\"date_match\"] & df_matched_scenes[\"near_title_match\"] & df_matched_scenes[\"studio_match\"]]\n",
+    "df_partially_matched = df_matched_scenes[df_matched_scenes[\"date_match\"] & df_matched_scenes[\"near_title_match\"] & df_matched_scenes[\"studio_match\"] & df_matched_scenes[\"url_match\"]]\n",
     "save_tpdb_uuid(df_partially_matched)"
    ]
   },