|
| 1 | +import functions_framework |
| 2 | +import os |
| 3 | +import re |
| 4 | +import json |
| 5 | +import hashlib |
| 6 | +import csv |
| 7 | +from google.cloud import storage |
| 8 | + |
| 9 | +# mapper code |
def create_folder_and_upload_file(bucket, folder_name, file_name, local_file_path):
    """Upload a local file into a named "folder" of a GCS bucket.

    GCS has no real directories, so a zero-byte object at ``folder_name/``
    is created first as a folder marker if one does not already exist.

    Args:
        bucket: google.cloud.storage bucket to upload into.
        folder_name: prefix (pseudo-folder) for the object.
        file_name: object name to create under the folder.
        local_file_path: path of the local file whose contents are uploaded.
    """
    marker_path = f"{folder_name}/"
    marker_blob = bucket.blob(marker_path)

    # Create the empty marker object only when the pseudo-folder is missing.
    if not marker_blob.exists():
        bucket.blob(marker_path).upload_from_string('')

    destination = f"{folder_name}/{file_name}"
    bucket.blob(destination).upload_from_filename(local_file_path)

    print(f"File '{file_name}' uploaded to folder '{folder_name}' in bucket.")
| 25 | + |
| 26 | + |
| 27 | + |
@functions_framework.http
def mapper(request):
    """HTTP Cloud Function: word-count map phase over a list of GCS text files.

    Expects a JSON body with:
        mapper_name: label for this mapper; names the output CSV and folder.
        file_list: list of [bucket_name, folder_name, file_name] triples
                   identifying the input text objects.  (Schema inferred
                   from the indexing below — confirm against the caller.)

    For each input file it downloads the text, normalizes whitespace,
    lowercases, strips non-alphanumeric characters, and emits one
    (word, document, 1) tuple per word.  All tuples are written to
    '<mapper_name>.csv' locally and uploaded to the 'mapper_bucket'
    bucket under a '<mapper_name>/' folder.

    Returns:
        "<mapper_name> OK" on success.
    """
    request_json = request.get_json(silent=True)

    # NOTE(review): credentials file is hard-coded; assumes it is bundled
    # with the function deployment.
    client = storage.Client.from_service_account_json('piyush-chaudhari-fall2023-9ae1ed20a7f3.json')

    tuples_list = []
    mapper_name = request_json['mapper_name']
    file_list = request_json['file_list']

    # Local CSV file that collects this mapper's intermediate output.
    csv_file_path = f'{mapper_name}.csv'

    for entry in file_list:
        bucket_name = entry[0]
        folder_name = entry[1]
        file_name = entry[2]
        # BUG FIX: the original built the path without the file name
        # (and shadowed the loop variable), so every blob lookup and every
        # emitted document label used a garbled placeholder instead of the
        # actual input file.
        file_path = f"{folder_name}/{file_name}"
        bucket = client.get_bucket(bucket_name)
        blob = bucket.blob(file_path)
        content = blob.download_as_text()

        # Collapse all whitespace runs to single spaces, lowercase, then
        # drop every character that is not alphanumeric or whitespace.
        processed_content = re.sub(r'\s+', ' ', content).lower()
        processed_content = re.sub(r'[^a-zA-Z0-9\s]', '', processed_content)

        words = processed_content.split()

        # Emit one (word, document, 1) tuple per word occurrence.
        tuples_list.extend((word, os.path.basename(file_path), 1) for word in words)

    # Persist the intermediate tuples as CSV with a header row.
    with open(csv_file_path, 'w', newline='') as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(['Word', 'Document', 'Count'])
        csv_writer.writerows(tuples_list)

    # Upload the CSV to the shared mapper output bucket for the reduce phase.
    mapper_bucket_name = "mapper_bucket"
    mapper_bucket = client.get_bucket(mapper_bucket_name)
    create_folder_and_upload_file(mapper_bucket, mapper_name, csv_file_path, csv_file_path)
    print(f"{mapper_name} OK")
    return f"{mapper_name} OK"
0 commit comments