Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feat/infra #11

Merged
merged 35 commits into from
May 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
50336bb
change the runtime value and compatible architecture
g-lorena Apr 23, 2024
8770698
change layer name and folders, added new path to locals and modify la…
g-lorena Apr 24, 2024
6df46e8
update gitignore
g-lorena Apr 24, 2024
f193334
refactor the request_layer infrastructure with the first method
g-lorena Apr 24, 2024
f5c7867
update gitignore
g-lorena Apr 24, 2024
eb1e1be
added glue catalog, glue crawler, glue iam and glue ob modules, insta…
g-lorena Apr 24, 2024
b3d43d7
added glue-catalog-table and database, modify infrastructure of both
g-lorena Apr 24, 2024
59403a1
added pyYaml module to requirements and added scripts to delete pytho…
g-lorena Apr 25, 2024
d90e231
added Infrastructure for glue_crawler, classifier, job and iam
g-lorena Apr 26, 2024
1e22e4f
change the runtime value and compatible architecture
g-lorena Apr 23, 2024
5f626f3
change layer name and folders, added new path to locals and modify la…
g-lorena Apr 24, 2024
3bc5254
update gitignore
g-lorena Apr 24, 2024
136b923
refactor the request_layer infrastructure with the first method
g-lorena Apr 24, 2024
eee2ea4
update gitignore
g-lorena Apr 24, 2024
1b3c02b
added glue catalog, glue crawler, glue iam and glue ob modules, insta…
g-lorena Apr 24, 2024
eaef31c
added glue-catalog-table and database, modify infrastructure of both
g-lorena Apr 24, 2024
0b99d32
added pyYaml module to requirements and added scripts to delete pytho…
g-lorena Apr 25, 2024
b708d74
added Infrastructure for glue_crawler, classifier, job and iam
g-lorena Apr 26, 2024
a741c53
added cloudwatch module to schedule lambda function run, modify reque…
g-lorena Apr 28, 2024
9e1bb90
resolve confilts
g-lorena Apr 28, 2024
be70689
update extract modules
g-lorena Apr 28, 2024
918006b
change variable for lambda layer and lambda function
g-lorena Apr 28, 2024
f51c325
added s3:PutObject policy to lambda, comment unused code
g-lorena Apr 28, 2024
710a751
added comments on unused code
g-lorena Apr 28, 2024
394447c
update
g-lorena Apr 30, 2024
dd383d7
modify extract
g-lorena Apr 30, 2024
c7dc627
added glue crawler and glue trigger, changed lambda function structur…
g-lorena May 4, 2024
dae76c4
update lambda function and glue job
g-lorena May 4, 2024
e24425b
change terraform code and glue job
g-lorena May 7, 2024
6eb4843
change script of glue job script
g-lorena May 7, 2024
4d0a213
delete unused zip
g-lorena May 7, 2024
8e1a748
added new line in gitignore
g-lorena May 7, 2024
dac243f
Merge branch 'develop' of github.com:g-lorena/aws_etl_pipeline into f…
g-lorena May 7, 2024
076955f
change request_layer output
g-lorena May 7, 2024
71c6db8
modify lambda layer terrafom code
g-lorena May 13, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
86 changes: 67 additions & 19 deletions Infra/locals.tf
Original file line number Diff line number Diff line change
@@ -1,30 +1,78 @@
# Shared constants consumed by the modules wired up in Infra/main.tf.
# NOTE(review): this hunk is a diff rendering with removed and added lines
# interleaved, so several keys below appear twice (old value first, new value after).
locals {

#buckets
lambda_layer_bucket_name = "my-lambda-layer-bucket-001"
lambda_layer = "lambda_layer"
rapid_api_host = "zillow56.p.rapidapi.com"
# NOTE(review): this looks like a real API key committed in plain text (the later
# rapid_api_key line uses the placeholder "XXXX"). Presumably it should be rotated — confirm.
rapid_api_key = "c7d66d4175msh4b730460e56d07dp177281jsn66cc27e2b144"
bucket_name = "real-estate-etl-101"
raw_repertory = "raw_data"
std_repertory = "std_data"
aws_region = "eu-west-3"

# layer
layer_zip_path = "requirements.zip"
lambda_layer = "lambda_layer"
rapid_api_host = "zillow56.p.rapidapi.com"
rapid_api_key = "XXXX"
bucket_name = "real-estate-etl-101"
raw_repertory = "raw_data"
std_repertory = "std_data"
aws_region = "eu-west-3"

# Utility bucket plus the key / local path of the Glue script; all three are
# passed to the s3bucket module in main.tf.
utils_bucket = "real-estate-etl-utils"
glue_script_key = "script/glue_etl_script.py"
glue_local_script_path = "../etl/glue_etl_job/transform_data.py"

# first method layer
layer_zip_path = "python.zip"
layer_name = "my_lambda_requirements_layer"
requirements_path = "../requirements.txt"


path_to_system_folder = "../etl/extract/System"

compatible_layer_runtimes = ["python3.10"]
compatible_architectures = ["x86_64"]

# lambda
path_to_source_file = "../etl/extract/extract_data.py"
path_to_output = "lambda_function_extract_data.zip"
function_name = "lambda_extract_fromAPI"
function_handler = "extract_data.lambda_handler"
memory_size = 512
timeout = 300
runtime = "python3.10"
path_to_source_folder = "../etl/extract"
#path_to_source_file = "../etl/extract"
path_to_output = "lambda_function_extract_data.zip"
function_name = "lambda_extract_fromAPI"
function_handler = "extract_data.lambda_handler"
memory_size = 512
timeout = 300
runtime = "python3.10"

# Glue catalog
glue_catalog_database_name = "real-estate-database"

# iam

# Glue Crawler
glue_Crawler_Name = "real_estate_crawler"
houston_crawler_name = "real_estate_houston_crawler"
panamera_crawler_name = "real_estate_panamera_crawler"
houston = "houston"
# NOTE(review): the key is named "panamera" but its value is "pasadena" — confirm which name is intended.
panamera = "pasadena"

# Glue Classifier
classifier_name = "real_estate_classifier"
json_path = "$[*]"

# Glue Job
glue_job_name = "real_estate_job"
glue_version = "4.0"
worker_type = "G.1X"
number_of_workers = 2
time_out = 2880
# NOTE(review): empty, and the glueJob module in main.tf takes its script location
# from the s3bucket module output instead — possibly unused; verify.
script_location = ""
class = "GlueApp"
enable-job-insights = "true"
enable-auto-scaling = "false"
enable-glue-datacatalog = "true"
job-language = "python"
job-bookmark-option = "job-bookmark-disable"
datalake-formats = "iceberg"
# NOTE(review): the Iceberg warehouse below points at s3://tnt-erp-sql/, which is
# neither bucket_name nor utils_bucket above — confirm this is the intended bucket.
conf = "spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions --conf spark.sql.catalog.glue_catalog=org.apache.iceberg.spark.SparkCatalog --conf spark.sql.catalog.glue_catalog.warehouse=s3://tnt-erp-sql/ --conf spark.sql.catalog.glue_catalog.catalog-impl=org.apache.iceberg.aws.glue.GlueCatalog --conf spark.sql.catalog.glue_catalog.io-impl=org.apache.iceberg.aws.s3.S3FileIO"

# cloudwatch
# Name and schedule expression handed to the eventbridge module in main.tf.
schedule_name = "schedule"
schedule_value = "cron(0 8 ? * MON-FRI *)"

# Glue Trigger
glue_trigger_name = "realestate-glue-job-trigger"
glue_trigger_schedule_type = "SCHEDULED"
glue_trigger_schedule_value = "cron(15 12 * * ? *)"

}
144 changes: 119 additions & 25 deletions Infra/main.tf
Original file line number Diff line number Diff line change
@@ -1,44 +1,138 @@


# Instantiates the ./modules/s3 module with the ETL bucket name, its raw/std
# folder names, and the utility bucket / Glue script inputs.
# NOTE(review): diff rendering — the first header line and the first
# bucket_name line are the removed versions of the lines that follow them.
module "s3bucket"{
module "s3bucket" {
source = "./modules/s3"

bucket_name = local.bucket_name
bucket_name = local.bucket_name
raw_repertory = local.raw_repertory
std_repertory = local.std_repertory

utils_bucket_name = local.utils_bucket
glue_script_key = local.glue_script_key
glue_local_script_path = local.glue_local_script_path

}

# Instantiates the Lambda layer module; the source moves from ./modules/layers
# to ./modules/request_layer in this change.
# NOTE(review): diff rendering — removed and added lines are interleaved, so the
# header, layer_zip_path, layer_name, lambda_layer, the commented-out
# path_to_request_layer_* lines and compatible_architectures each appear twice.
module "lambdaLayer"{
source = "./modules/layers"
module "lambdaLayer" {
source = "./modules/request_layer"

requirements_path = local.requirements_path
layer_zip_path = local.layer_zip_path
layer_name = local.layer_name
layer_zip_path = local.layer_zip_path
layer_name = local.layer_name

path_to_system_folder = local.path_to_system_folder

lambda_layer_bucket_name = local.lambda_layer_bucket_name
lambda_layer = local.lambda_layer
lambda_layer = local.lambda_layer

#path_to_request_layer_source = local.path_to_request_layer_source
#path_to_request_layer_artifact = local.path_to_request_layer_artifact

#path_to_request_layer_filename = local.path_to_request_layer_filename
#request_layer_name = local.request_layer_name


#path_to_request_layer_source = local.path_to_request_layer_source
#path_to_request_layer_artifact = local.path_to_request_layer_artifact

#path_to_request_layer_filename = local.path_to_request_layer_filename
#request_layer_name = local.request_layer_name

compatible_layer_runtimes = local.compatible_layer_runtimes
compatible_architectures = local.compatible_architectures
compatible_architectures = local.compatible_architectures

}

# Instantiates the extract Lambda function from ./modules/lambda, attaching the
# layer built by module.lambdaLayer and the ETL bucket from module.s3bucket.
# NOTE(review): diff rendering — the first argument group is the removed version
# (path_to_source_file, s3_bucket_arn output), the second group is the added
# version (path_to_source_folder, s3_etl_bucket_arn output).
module "lambdaFunction" {
source = "./modules/lambda"

path_to_source_file = local.path_to_source_file
path_to_output = local.path_to_output
function_name = local.function_name
function_handler = local.function_handler
memory_size = local.memory_size
timeout = local.timeout
runtime = local.runtime
rapid_api_host = local.rapid_api_host
rapid_api_key = local.rapid_api_key
bucket_name = local.bucket_name
raw_repertory = local.raw_repertory
# NOTE(review): "lamnda_layer_arn" looks like a misspelling of "lambda_layer_arn";
# it must match the output name declared in the layer module (not visible here).
lambda_layer_arns = [module.lambdaLayer.lamnda_layer_arn]
aws_region = local.aws_region
s3_bucket_arn = module.s3bucket.s3_bucket_arn

path_to_source_folder = local.path_to_source_folder
path_to_output = local.path_to_output
function_name = local.function_name
function_handler = local.function_handler
memory_size = local.memory_size
timeout = local.timeout
runtime = local.runtime
rapid_api_host = local.rapid_api_host
rapid_api_key = local.rapid_api_key
bucket_name = local.bucket_name
raw_repertory = local.raw_repertory
lambda_layer_arns = [module.lambdaLayer.lamnda_layer_arn]
aws_region = local.aws_region
s3_bucket_arn = module.s3bucket.s3_etl_bucket_arn

}

# Schedules the extract Lambda through the eventbridge module.
module "cloudwatch_schedule_module" {
  source = "./modules/eventbridge"

  # Lambda to invoke.
  aws_lambda_function_name = module.lambdaFunction.lambda_function_name
  aws_lambda_arn           = module.lambdaFunction.lambda_function_arn

  # Rule name and schedule expression.
  schedule_name  = local.schedule_name
  schedule_value = local.schedule_value
}

# Creates the Glue catalog database named by local.glue_catalog_database_name.
module "glueCatalogDatabase" {
  glue_catalog_database_name = local.glue_catalog_database_name

  source = "./modules/glue_catalog_database"
}

# IAM module for Glue; no inputs are passed from the root module.
module "glueIamRole" {
  source = "./modules/glue_iam"
}

# JSON classifier used by the crawlers (see module.glueCrawler's classifiers input).
module "glueClassifier" {
  source = "./modules/glue_classifier"

  json_path       = local.json_path
  classifier_name = local.classifier_name
}

# Glue crawlers for the two datasets (houston / panamera) over the ETL bucket.
module "glueCrawler" {
  source = "./modules/glue_crawler"

  # Shared inputs: target catalog database, IAM role, custom classifier.
  database      = module.glueCatalogDatabase.database_name
  glue_iam_role = module.glueIamRole.glue_iam_arn
  classifiers   = [module.glueClassifier.aws_glue_classifier_id]

  # Houston crawler.
  houston                = local.houston
  houston_crawler_name   = local.houston_crawler_name
  s3_target_path_houston = module.s3bucket.aws_s3_bucket_uri

  # Panamera crawler (receives the same bucket URI as the houston crawler).
  panamera                = local.panamera
  panamera_crawler_name   = local.panamera_crawler_name
  s3_target_path_panamera = module.s3bucket.aws_s3_bucket_uri

  # Earlier single-crawler inputs, kept disabled:
  #name = local.glue_Crawler_Name
  #s3_target_path = module.s3bucket.aws_s3_bucket_uri
}

# Glue ETL job; its script location comes from the s3bucket module output.
module "glueJob" {
  source = "./modules/glue_job"

  # Job identity and runtime.
  name            = local.glue_job_name
  glue_version    = local.glue_version
  timeout         = local.time_out
  iam_glue_arn    = module.glueIamRole.glue_iam_arn
  script_location = module.s3bucket.aws_s3_bucket_glue_script_uri
  #worker_type = local.worker_type

  # Job settings forwarded unchanged from locals.
  class                   = local.class
  job-language            = local.job-language
  job-bookmark-option     = local.job-bookmark-option
  enable-job-insights     = local.enable-job-insights
  enable-auto-scaling     = local.enable-auto-scaling
  enable-glue-datacatalog = local.enable-glue-datacatalog
  datalake-formats        = local.datalake-formats
  conf                    = local.conf
}

# Scheduled trigger that starts the Glue job created by module.glueJob.
module "glueTrigger" {
  source = "./modules/glue_trigger"

  name          = local.glue_trigger_name
  schedule_type = local.glue_trigger_schedule_type

  # Use the Glue-specific cron from locals. The original passed
  # local.schedule_value (the Lambda's EventBridge schedule), which left
  # local.glue_trigger_schedule_value defined but never used.
  schedule_value = local.glue_trigger_schedule_value

  job_name = module.glueJob.aws_glue_job_name
}


Expand Down
18 changes: 18 additions & 0 deletions Infra/modules/eventbridge/main.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Event rule that fires on the schedule expression supplied by the caller.
resource "aws_cloudwatch_event_rule" "schedule" {
  name                = var.schedule_name
  schedule_expression = var.schedule_value
  description         = "Schedule for Lambda Function"
}

# Attaches the Lambda (by ARN) as the target of the schedule rule.
resource "aws_cloudwatch_event_target" "schedule_lambda" {
  target_id = "processing_lambda"
  arn       = var.aws_lambda_arn
  rule      = aws_cloudwatch_event_rule.schedule.name
}

# Lets EventBridge invoke the Lambda function.
resource "aws_lambda_permission" "allow_events_bridge_to_run_lambda" {
  statement_id  = "AllowExecutionFromCloudWatch"
  action        = "lambda:InvokeFunction"
  function_name = var.aws_lambda_function_name
  principal     = "events.amazonaws.com"

  # Restrict the grant to this module's schedule rule; without source_arn any
  # EventBridge rule could invoke the function.
  source_arn = aws_cloudwatch_event_rule.schedule.arn
}
File renamed without changes.
23 changes: 23 additions & 0 deletions Infra/modules/eventbridge/variables.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Schedule expression assigned to the event rule's schedule_expression.
variable "schedule_value" {
  type        = string
  description = "schedule"
}

# ARN used as the event target (the Lambda to invoke).
variable "aws_lambda_arn" {
  type        = string
  description = "variable arn"
}

# Function name used by the aws_lambda_permission resource in this module.
variable "aws_lambda_function_name" {
  # Description fixed: the original read "variable function namearn" (typo).
  description = "Name of the Lambda function that EventBridge is allowed to invoke"
  type        = string
}

# Name given to the event rule.
variable "schedule_name" {
  type        = string
  description = "schedule name"
}




3 changes: 3 additions & 0 deletions Infra/modules/glue_catalog_database/main.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Glue catalog database; its name is supplied by the caller.
resource "aws_glue_catalog_database" "aws_glue_catalog_database" {
  name = var.glue_catalog_database_name
}
3 changes: 3 additions & 0 deletions Infra/modules/glue_catalog_database/output.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Exposes the created database's name to the root module.
output "database_name" {
  value = aws_glue_catalog_database.aws_glue_catalog_database.name
}
4 changes: 4 additions & 0 deletions Infra/modules/glue_catalog_database/variables.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Name assigned to the aws_glue_catalog_database resource in this module.
variable "glue_catalog_database_name" {
  # Description fixed: the original said "principal bucket name", a copy-paste
  # from a bucket variable that does not describe this input.
  description = "Name of the Glue catalog database to create"
  type        = string
}
4 changes: 4 additions & 0 deletions Infra/modules/glue_catalog_table/main.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Glue catalog table created inside the database named by var.database_name.
resource "aws_glue_catalog_table" "glue_catalog_table" {
  # The module declares "table_name" (underscore). The original referenced
  # var.table-name, which is an undeclared variable and fails validation.
  name          = var.table_name
  database_name = var.database_name
}
7 changes: 7 additions & 0 deletions Infra/modules/glue_catalog_table/output.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Name of the catalog table managed by this module.
output "name" {
  value = aws_glue_catalog_table.glue_catalog_table.name
}

# ARN of the catalog table managed by this module.
output "arn" {
  value = aws_glue_catalog_table.glue_catalog_table.arn
}
9 changes: 9 additions & 0 deletions Infra/modules/glue_catalog_table/variables.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Input: name for the catalog table.
variable "table_name" {
  type        = string
  description = "glue catalog table name"
}

# Input: catalog database the table is created in.
variable "database_name" {
  type        = string
  description = "glue catalog database name"
}
7 changes: 7 additions & 0 deletions Infra/modules/glue_classifier/main.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Custom JSON classifier; both its name and JSON path come from the caller.
resource "aws_glue_classifier" "crawler_classifier" {
  json_classifier {
    json_path = var.json_path
  }

  name = var.classifier_name
}
4 changes: 4 additions & 0 deletions Infra/modules/glue_classifier/output.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Exposes the classifier's id so the root module can pass it to the crawler
# module's classifiers list.
output "aws_glue_classifier_id" {
  # Replaces the commented-out description "The name of the Glue ETL Job",
  # which described a different resource.
  description = "ID of the custom Glue classifier"
  value       = aws_glue_classifier.crawler_classifier.id
}
9 changes: 9 additions & 0 deletions Infra/modules/glue_classifier/variables.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Input: name assigned to the Glue classifier.
variable "classifier_name" {
  type        = string
  description = "classifier name"
}

# Input: JSON path handed to the classifier's json_classifier block.
variable "json_path" {
  type        = string
  description = "json path"
}
Loading
Loading