Selenium in Airflow #43574
-
Hi everyone, I tried to create a simple DAG with the following tasks,
but I am facing an issue pushing and pulling XComs. Error: Error creating driver: Can't pickle local object '_createenviron.<locals>.encode'. As per Stack Overflow 👇 I modified xcom_backend=airflow.models.xcom.BaseXCom
enable_xcom_pickling = True from datetime import datetime, timedelta
from airflow import DAG
from airflow.operators.python import PythonOperator
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
import os
# Constants
DOWNLOAD_PATH = os.path.expanduser('~/Downloads') # Adjust as needed
AMAZON_URL = "https://www.amazon.com"  # Landing page opened by navigate_to_amazon
def create_chrome_driver(**context):
    """Create a configured Chrome WebDriver and push it to XCom.

    NOTE(review): a live WebDriver holds an open browser process and
    socket handles, so it is not picklable — the ``xcom_push`` below is
    what raises "Can't pickle local object ...". Airflow tasks run in
    separate processes (possibly different machines), so the driver
    should be created, used, and quit within one task instead of being
    passed through XCom.

    Returns:
        str: a success message (the PythonOperator's return value also
        lands in XCom under the default 'return_value' key).

    Raises:
        Exception: re-raised after logging if driver creation fails.
    """
    print(context)
    try:
        chrome_options = Options()
        chrome_options.add_argument('--no-sandbox')
        # chrome_options.add_argument('--headless') # Run in headless mode
        chrome_options.add_argument('--disable-dev-shm-usage')
        # Configure silent downloads into DOWNLOAD_PATH (no prompt dialogs).
        prefs = {
            "download.default_directory": DOWNLOAD_PATH,
            "download.prompt_for_download": False,
            "download.directory_upgrade": True,
            "safebrowsing.enabled": True,
        }
        chrome_options.add_experimental_option("prefs", prefs)
        # Initialize driver; Selenium 4's Selenium Manager locates chromedriver,
        # so an explicit Service is not required.
        # service = Service(ChromeDriverManager().install())
        driver = webdriver.Chrome(options=chrome_options)  # service=service,
        driver.implicitly_wait(10)
        # BUG: WebDriver instances cannot be serialized; this push fails
        # regardless of enable_xcom_pickling. See docstring note.
        context['task_instance'].xcom_push(key='webdriver', value=driver)
        return "Driver created successfully"
    except Exception as e:
        print(f"Error creating driver: {str(e)}")
        raise
def navigate_to_amazon(**context):
    """Pull the WebDriver pushed by the 'create_driver' task and open Amazon.

    Returns:
        str: a success message including AMAZON_URL.

    Raises:
        AssertionError: if the page title does not contain "Amazon".
        Exception: re-raised after logging on any navigation failure.
    """
    # Fix: bind 'driver' before the try block. In the original code,
    # if xcom_pull raised, the finally clause's `if driver:` hit a
    # NameError that masked the real error.
    driver = None
    try:
        # Get driver from previous task. NOTE(review): in practice this
        # cannot work — WebDriver objects are not serializable across the
        # separate processes Airflow tasks run in (see create_chrome_driver).
        driver = context['task_instance'].xcom_pull(task_ids='create_driver', key='webdriver')
        print("driver init successfully")
        # Navigate to Amazon
        driver.get(AMAZON_URL)
        # Basic sanity check that navigation landed on the expected page.
        assert "Amazon" in driver.title, "Failed to navigate to Amazon"
        return f"Successfully navigated to {AMAZON_URL}"
    except Exception as e:
        print(f"Error navigating to Amazon: {str(e)}")
        raise
    finally:
        # Always release the browser, even on failure.
        if driver:
            driver.quit()
# DAG definition
default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': datetime(2024, 1, 1),
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

with DAG(
    'amazon_driver_test',
    default_args=default_args,
    description='Test DAG for creating WebDriver and navigating to Amazon',
    schedule_interval=None,  # Manual trigger only
    catchup=False,
) as dag:
    # NOTE: provide_context=True was removed — Airflow 2 injects the task
    # context into the callable automatically; the flag is deprecated.
    create_driver = PythonOperator(
        task_id='create_driver',
        python_callable=create_chrome_driver,
    )

    goto_amazon = PythonOperator(
        task_id='navigate_to_amazon',
        python_callable=navigate_to_amazon,
    )

    # Set task dependencies
    create_driver >> goto_amazon
Beta Was this translation helpful? Give feedback.
Replies: 3 comments
-
Can't we serialize the webdriver.Chrome variable? |
Beta Was this translation helpful? Give feedback.
-
Not sure it's a good idea to have Airflow tasks as single actions in Selenium. In Airflow every task is executed in a separate process, potentially on different machines, and AFAIK Selenium requires establishing a connection and running in the same process. That's why you see the serialization issue: Airflow tries to serialize the driver connected to your webserver and would then have to restore it in another process. I simply think it's a bad idea to use it with Airflow. |
Beta Was this translation helpful? Give feedback.
-
Thanks for your info @potiuk |
Beta Was this translation helpful? Give feedback.
Not sure it's a good idea to have Airflow tasks as single actions in Selenium. In Airflow every task is executed in a separate process, potentially on different machines, and AFAIK Selenium requires establishing a connection and running in the same process. That's why you see the serialization issue: Airflow tries to serialize the driver connected to your webserver and would then have to restore it in another process.
I simply think it's a bad idea to use it with Airflow.