Skip to content

Commit

Permalink
Improved scene matching with parallelism + more
Browse files Browse the repository at this point in the history
  • Loading branch information
MinasukiHikimuna committed Aug 28, 2024
1 parent 4b9e373 commit ecf12f3
Showing 1 changed file with 82 additions and 40 deletions.
122 changes: 82 additions & 40 deletions pandas/scene_matching.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,20 @@
" return True # If stash_ids is not a list, assume endpoint is missing\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"missing_from_stashbox_tag = stash.find_tags({\n",
" \"name\": {\n",
" \"modifier\": \"EQUALS\",\n",
" \"value\": \"Missing From Stashbox\"\n",
" }\n",
"})"
]
},
{
"cell_type": "code",
"execution_count": null,
Expand All @@ -65,6 +79,12 @@
" \"stash_id_endpoint\": {\n",
" \"endpoint\": \"https://stashdb.org/graphql\",\n",
" \"modifier\": \"NOT_NULL\"\n",
" },\n",
" \"tags\": {\n",
" \"depth\": -1,\n",
" \"excludes\": [missing_from_stashbox_tag[0][\"id\"]],\n",
" \"value\": [],\n",
" \"modifier\": \"INCLUDES_ALL\"\n",
" }\n",
" },\n",
" fragment=\"\"\"\n",
Expand Down Expand Up @@ -113,12 +133,27 @@
"outputs": [],
"source": [
"# Run this cell to find scenes of every studio under a single parent studio (the single-studio lookup is kept commented out below)\n",
"studio_name = \"Meana Wolf\"\n",
"studios = stash.find_studios({ \n",
"# studio_name = \"Whipped Ass\"\n",
"# studios = stash.find_studios({ \n",
"# \"name\": {\n",
"# \"modifier\": \"EQUALS\",\n",
"# \"value\": studio_name\n",
"# }\n",
"# })\n",
"\n",
"parent_studio_name = \"Brazzers\"\n",
"parent_studios = stash.find_studios({\n",
" \"name\": {\n",
" \"modifier\": \"EQUALS\",\n",
" \"value\": studio_name\n",
" } \n",
" \"value\": parent_studio_name\n",
" }\n",
"})\n",
"\n",
"studios = stash.find_studios({\n",
" \"parents\": {\n",
" \"value\": [parent_studios[0][\"id\"]],\n",
" \"modifier\": \"INCLUDES\"\n",
" }\n",
"})\n",
"\n",
"studio_ids = [studio[\"id\"] for studio in studios]\n",
Expand Down Expand Up @@ -222,44 +257,52 @@
"\n",
"import pandas as pd\n",
"import requests\n",
"from concurrent.futures import ThreadPoolExecutor, as_completed\n",
"\n",
"# Function to fetch scene data for a single row\n",
"def fetch_single_scene_data(row, headers):\n",
" id = row[\"id\"]\n",
" phash = row[\"phash\"]\n",
"\n",
" response = requests.get(\n",
" f\"https://api.theporndb.net/scenes?hash={phash}&hashType=PHASH\",\n",
" headers=headers,\n",
" )\n",
"\n",
" if response.status_code == 200:\n",
" response_json = response.json()\n",
"\n",
" if response_json:\n",
" scenes_data = response_json.get(\"data\", [])\n",
" if scenes_data:\n",
" scene_data_list = []\n",
" for scene_data in scenes_data:\n",
" scene_data_list.append({\n",
" \"tpdb_date\": scene_data.get(\"date\"),\n",
" \"tpdb_title\": scene_data.get(\"title\"),\n",
" \"tpdb_uuid\": scene_data.get(\"id\"),\n",
" \"tpdb_url\": scene_data.get(\"url\"),\n",
" \"tpdb_studio_id\": scene_data.get(\"site\", {}).get(\"uuid\"),\n",
" \"tpdb_studio_name\": scene_data.get(\"site\", {}).get(\"name\"),\n",
" \"tpdb_studio_url\": scene_data.get(\"site\", {}).get(\"url\"),\n",
" })\n",
" return {\n",
" \"id\": id,\n",
" \"stash_scene\": row.to_dict(), # Store the whole row as a dict\n",
" \"matches\": scene_data_list\n",
" }\n",
" return None\n",
"\n",
"# Function to fetch scene data by phash for every row in parallel and return the results as a DataFrame\n",
"def fetch_scene_data(df, headers):\n",
" # List to store the scene data for DataFrame creation\n",
" scenes_data_list = []\n",
"\n",
" # Loop through data frame rows\n",
" for index, row in df.iterrows():\n",
" id = row[\"id\"]\n",
" phash = row[\"phash\"]\n",
"\n",
" response = requests.get(\n",
" f\"https://api.theporndb.net/scenes?hash={phash}&hashType=PHASH\",\n",
" headers=headers,\n",
" )\n",
"\n",
" if response.status_code == 200:\n",
" response_json = response.json()\n",
"\n",
" if response_json:\n",
" scenes_data = response_json.get(\"data\", [])\n",
" if scenes_data:\n",
" scene_data_list = []\n",
" for scene_data in scenes_data:\n",
" scene_data_list.append({\n",
" \"tpdb_date\": scene_data.get(\"date\"),\n",
" \"tpdb_title\": scene_data.get(\"title\"),\n",
" \"tpdb_uuid\": scene_data.get(\"id\"),\n",
" \"tpdb_url\": scene_data.get(\"url\"),\n",
" \"tpdb_studio_id\": scene_data.get(\"site\", {}).get(\"uuid\"),\n",
" \"tpdb_studio_name\": scene_data.get(\"site\", {}).get(\"name\"),\n",
" \"tpdb_studio_url\": scene_data.get(\"site\", {}).get(\"url\"),\n",
" })\n",
" scenes_data_list.append({\n",
" \"id\": id,\n",
" \"stash_scene\": row.to_dict(), # Store the whole row as a dict\n",
" \"matches\": scene_data_list\n",
" })\n",
" with ThreadPoolExecutor(max_workers=10) as executor:\n",
" future_to_row = {executor.submit(fetch_single_scene_data, row, headers): row for _, row in df.iterrows()}\n",
" for future in as_completed(future_to_row):\n",
" result = future.result()\n",
" if result:\n",
" scenes_data_list.append(result)\n",
"\n",
" # Create a DataFrame from the list of scene data\n",
" return pd.DataFrame(scenes_data_list)\n",
Expand Down Expand Up @@ -498,8 +541,7 @@
"metadata": {},
"outputs": [],
"source": [
"# Run this cell to save TPDB UUID for those where all identifiers match\n",
"save_tpdb_uuid(df_matched_scenes[df_matched_scenes[\"all_match\"]])"
"save_tpdb_uuid(df_matched_scenes[df_matched_scenes[\"date_match\"] & df_matched_scenes[\"exact_title_match\"]])\n"
]
},
{
Expand All @@ -510,7 +552,7 @@
"source": [
"# Example: Run this cell to save TPDB UUIDs for scenes where the date, studio name and URL all match and the title is a near match.\n",
"# Tweak the conditions as needed.\n",
"df_partially_matched = df_matched_scenes[df_matched_scenes[\"date_match\"] & df_matched_scenes[\"near_title_match\"] & df_matched_scenes[\"studio_match\"]]\n",
"df_partially_matched = df_matched_scenes[df_matched_scenes[\"date_match\"] & df_matched_scenes[\"near_title_match\"] & df_matched_scenes[\"studio_match\"] & df_matched_scenes[\"url_match\"]]\n",
"save_tpdb_uuid(df_partially_matched)"
]
},
Expand Down

0 comments on commit ecf12f3

Please sign in to comment.