Skip to content

Commit 9dc08c1

Browse files
committed
Mapper FaaS
1 parent b05a0b6 commit 9dc08c1

File tree

3 files changed

+92
-0
lines changed

3 files changed

+92
-0
lines changed

mapper/main.py

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
import functions_framework
2+
import os
3+
import re
4+
import json
5+
import hashlib
6+
import csv
7+
from google.cloud import storage
8+
9+
# mapper code
10+
def create_folder_and_upload_file(bucket, folder_name, file_name, local_file_path):
    """Upload ``local_file_path`` into ``folder_name`` inside ``bucket``.

    GCS has no real directories, so when the pseudo-folder is missing an
    empty object named ``<folder_name>/`` is first written as a marker.

    Args:
        bucket: google.cloud.storage Bucket to upload into.
        folder_name: Destination "folder" prefix within the bucket.
        file_name: Object name to store the file under (inside the folder).
        local_file_path: Path of the local file to upload.
    """
    marker = bucket.blob(f"{folder_name}/")
    if not marker.exists():
        # An empty object whose name ends in '/' acts as the folder marker.
        marker.upload_from_string('')

    destination = f"{folder_name}/{file_name}"
    bucket.blob(destination).upload_from_filename(local_file_path)

    print(f"File '{file_name}' uploaded to folder '{folder_name}' in bucket.")
25+
26+
27+
28+
@functions_framework.http
def mapper(request):
    """HTTP Cloud Function: map phase of a word-count MapReduce.

    Expects a JSON body of the form::

        {"mapper_name": "<id>",
         "file_list": [[bucket_name, folder_name, file_name], ...]}

    Each listed file is downloaded from GCS, normalized (whitespace
    collapsed, lowercased, non-alphanumerics stripped), and expanded into
    ``(word, document, 1)`` tuples. The tuples are written to
    ``<mapper_name>.csv`` locally and then uploaded to the
    ``mapper_bucket`` bucket under a folder named after the mapper.

    Returns:
        ``"<mapper_name> OK"`` on success, or a 400 response when the
        request carries no parseable JSON body.
    """
    request_json = request.get_json(silent=True)
    # get_json(silent=True) yields None on a missing/malformed body;
    # fail fast instead of raising an opaque TypeError below.
    if not request_json:
        return "Bad Request: JSON body required", 400

    # NOTE(review): key file is bundled with the function source — confirm
    # this should not use ambient credentials (storage.Client()) instead.
    client = storage.Client.from_service_account_json(
        'piyush-chaudhari-fall2023-9ae1ed20a7f3.json')

    mapper_name = request_json['mapper_name']
    file_list = request_json['file_list']

    # Local CSV that accumulates this mapper's output.
    csv_file_path = f'{mapper_name}.csv'

    tuples_list = []
    for bucket_name, folder_name, file_name in file_list:
        # BUG FIX: the path previously interpolated a placeholder instead
        # of the actual file name, so every download targeted a
        # nonexistent blob.
        file_path = f"{folder_name}/{file_name}"
        bucket = client.get_bucket(bucket_name)
        content = bucket.blob(file_path).download_as_text()

        # Normalize: collapse whitespace, lowercase, drop punctuation.
        processed_content = re.sub(r'\s+', ' ', content).lower()
        processed_content = re.sub(r'[^a-zA-Z0-9\s]', '', processed_content)
        words = processed_content.split()

        # One (word, document, 1) tuple per word occurrence.
        tuples_list.extend(
            (word, os.path.basename(file_path), 1) for word in words)

    # Persist the intermediate key/value pairs as a CSV with a header row.
    with open(csv_file_path, 'w', newline='') as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(['Word', 'Document', 'Count'])
        csv_writer.writerows(tuples_list)

    mapper_bucket = client.get_bucket("mapper_bucket")
    create_folder_and_upload_file(mapper_bucket, mapper_name,
                                  csv_file_path, csv_file_path)
    print(f"{mapper_name} OK")
    return f"{mapper_name} OK"
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
{
2+
"type": "service_account",
3+
"project_id": "piyush-chaudhari-fall2023",
4+
"private_key_id": "",
5+
"private_key": "",
6+
"client_email": "googlecloudstorage@piyush-chaudhari-fall2023.iam.gserviceaccount.com",
7+
"client_id": "",
8+
"auth_uri": "https://accounts.google.com/o/oauth2/auth",
9+
"token_uri": "https://oauth2.googleapis.com/token",
10+
"auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
11+
"client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/googlecloudstorage%40piyush-chaudhari-fall2023.iam.gserviceaccount.com",
12+
"universe_domain": "googleapis.com"
13+
}

mapper/requirements.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
functions-framework==3.*
2+
google-cloud-storage==2.13.0
3+
six==1.16.0

0 commit comments

Comments
 (0)