Commit ce08b9c

Merge pull request #2715 from jeff1evesque/feature-5
#5: Create backend logic to parse contents of URL
2 parents e0a62f1 + af989c2 commit ce08b9c

27 files changed: +795 -22 lines

brain/session/data/dataset_to_dict.py

Lines changed: 1 addition & 1 deletion
@@ -26,7 +26,7 @@ def dataset_dictionary(id_entity, model_type, upload):
     dataset = []
     observation_labels = []
     list_error = []
-    json_upload = upload['dataset']['json_string']
+    json_upload = upload['dataset'].get('json_string', None)
     list_model_type = current_app.config.get('MODEL_TYPE')

     if json_upload:
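The substantive fix above is the switch from direct indexing to dict.get: a url-based submission carries no 'json_string' key, so the old lookup raised KeyError before validation could report anything useful. A minimal standalone sketch of the difference, using a hypothetical payload (the 'dataset_url' key below is illustrative, not from the source):

# hypothetical payload for a url-based submission: no 'json_string' key
upload = {'dataset': {'dataset_url': ['https://example.com/svm.json']}}

# old lookup: raises KeyError, since the key is absent
try:
    json_upload = upload['dataset']['json_string']
except KeyError:
    json_upload = None

# new lookup: returns None directly, so a later 'if json_upload:' skips
json_upload = upload['dataset'].get('json_string', None)
assert json_upload is None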

brain/session/data/validate_file_extension.py

Lines changed: 89 additions & 4 deletions
@@ -16,8 +16,15 @@ def reduce_dataset(dataset, session_type):
     # variables
     list_error = []

-    # web-interface: validate, and restructure dataset
-    if dataset['data']['dataset']['file_upload']:
+    # web-interface: validate, and restructure 'file-upload' dataset
+    if (
+        dataset['data'].get('dataset', None) and
+        dataset['data']['dataset'].get('file_upload', None) and
+        dataset['data']['settings'].get(
+            'dataset_type', None) == 'file_upload'
+    ):
+
+        # validate and restructure
         validator = Validate_File_Extension(
             dataset,
             session_type
@@ -29,13 +36,91 @@ def reduce_dataset(dataset, session_type):
             adjusted_dataset['error']
         )

-    # programmatic-interface: validate, do not restructure
-    elif dataset['data']['dataset']['json_string']:
+    # web-interface: validate, and restructure url dataset
+    elif (
+        dataset['data']['settings'].get('dataset[]', None) and
+        dataset['data']['settings'].get(
+            'dataset_type', None) == 'dataset_url'
+    ):
+
+        # define 'file_upload', since it doesn't exist
+        data = dataset['data']
+        data['dataset'] = {}
+        if type(data['settings']['dataset[]']) is list:
+            data['dataset']['file_upload'] = data['settings']['dataset[]']
+        else:
+            data['dataset']['file_upload'] = []
+            data['dataset']['file_upload'].append(
+                data['settings']['dataset[]']
+            )
+
+        # validate and restructure
+        validator = Validate_File_Extension(
+            {
+                'data': {
+                    'dataset': {
+                        'file_upload': data['dataset']['file_upload'],
+                        'type': data['settings']['dataset_type'],
+                    }
+                },
+            },
+            session_type
+        )
+        adjusted_dataset = validator.validate()
+
+        if adjusted_dataset['error']:
+            list_error.append(
+                adjusted_dataset['error']
+            )
+
+    # programmatic-interface: validate, do not restructure file upload
+    elif (
+        dataset['data']['dataset'].get('json_string', None) and
+        dataset['data']['settings'].get(
+            'dataset_type', None) == 'file_upload'
+    ):
+
         adjusted_dataset = dataset['data']

         if dataset['error']:
             list_error.append(adjusted_dataset['error'])

+    # programmatic-interface: validate, and restructure url dataset
+    elif (
+        dataset['data']['dataset'].get('json_string', None) and
+        dataset['data']['settings'].get(
+            'dataset_type', None) == 'dataset_url'
+    ):
+
+        # define 'file_upload', since it doesn't exist
+        data = dataset['data']
+        if type(data['dataset']['json_string']) is list:
+            data['dataset']['file_upload'] = data['dataset']['json_string']
+        else:
+            data['dataset']['file_upload'] = []
+            data['dataset']['file_upload'].append(
+                data['dataset']['json_string']
+            )
+
+        # validate and restructure
+        validator = Validate_File_Extension(
+            {
+                'data': {
+                    'dataset': {
+                        'file_upload': data['dataset']['file_upload'],
+                        'type': data['settings']['dataset_type'],
+                    }
+                },
+            },
+            session_type
+        )
+        adjusted_dataset = validator.validate()
+
+        if adjusted_dataset['error']:
+            list_error.append(
+                adjusted_dataset['error']
+            )
+
     # return
     if list_error:
         return {'dataset': None, 'error': list_error}
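Taken together, the four branches dispatch on the ('dataset_type', interface) pair, and both url branches coerce their input into the 'file_upload' shape before handing it to the validator. A sketch of a programmatic url submission reaching the final branch; the payload shape is inferred from the elif conditions above, and the success-path return shape is assumed to mirror the error path:

# payload shape inferred from the elif conditions above (an assumption)
dataset = {
    'error': None,
    'data': {
        'settings': {'dataset_type': 'dataset_url'},
        'dataset': {
            'json_string': [
                'https://raw.githubusercontent.com/jeff1evesque'
                '/machine-learning/master/interface/static/data'
                '/json/web_interface/svm.json',
            ],
        },
    },
}

# the url list is wrapped as 'file_upload', then validated and fetched
result = reduce_dataset(dataset, 'data_append')

# assumes the success path also returns a dict with an 'error' key
if result['error']:
    print(result['error'])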

brain/validator/validate_file_extension.py

Lines changed: 63 additions & 6 deletions
@@ -9,6 +9,8 @@
 '''

 import os.path
+import urllib
+import cStringIO
 from brain.converter.calculate_md5 import calculate_md5


@@ -45,22 +47,30 @@ def validate(self):

         # local variables
         list_error = []
-
-        dataset = self.premodel_data['data']['dataset']
         acceptable_type = ['csv', 'xml', 'json']

-        unique_hash = set()
+        unique_data = set()
         dataset_keep = []

-        if (dataset.get('file_upload', None)):
+        # validate and restructure: file upload
+        if (
+            self.premodel_data['data'].get('settings', None) and
+            self.premodel_data['data']['settings'].get(
+                'dataset_type', None) == 'file_upload' and
+            self.premodel_data.get('data', None) and
+            self.premodel_data['data'].get('dataset', None) and
+            self.premodel_data['data']['dataset'].get('file_upload', None)
+        ):
+
+            dataset = self.premodel_data['data']['dataset']

             for index, filedata in enumerate(dataset['file_upload']):
                 try:
                     split_path = os.path.splitext(filedata['filename'])
                     filehash = calculate_md5(filedata['file'])
                     # add 'hashed' value of file reference(s) to a list
-                    if filehash not in unique_hash:
-                        unique_hash.add(filehash)
+                    if filehash not in unique_data:
+                        unique_data.add(filehash)
                         file_extension = split_path[1][1:].strip().lower()

                         # validate file_extension
@@ -86,6 +96,53 @@ def validate(self):
             # replace portion of dataset with unique 'file reference(s)'
             dataset['file_upload'][:] = dataset_keep

+        # validate and restructure: url reference
+        elif (
+            self.premodel_data.get('data', None) and
+            self.premodel_data['data'].get('dataset', None) and
+            self.premodel_data['data']['dataset'].get(
+                'type', None) and
+            self.premodel_data['data']['dataset']['type'] == 'dataset_url'
+        ):
+
+            dataset = self.premodel_data['data']['dataset']
+            urls = self.premodel_data['data']['dataset']['file_upload']
+
+            for index, url in enumerate(urls):
+                split_path = os.path.splitext(url)
+                file_extension = split_path[1][1:].strip().lower()
+
+                try:
+                    if url not in unique_data:
+                        unique_data.add(url)
+
+                        # validate file_extension
+                        if (file_extension not in acceptable_type):
+                            msg = '''Problem: url reference, \''''
+                            msg += file_extension
+                            msg += '''\', must be one of the formats:'''
+                            msg += '\n ' + ', '.join(acceptable_type)
+                            list_error.append(msg)
+
+                        # keep non-duplicated url references
+                        else:
+                            filename = os.path.split(url)[1]
+                            dataset_keep.append({
+                                'type': file_extension,
+                                'file': cStringIO.StringIO(
+                                    urllib.urlopen(url).read()
+                                ),
+                                'filename': filename
+                            })
+
+                except:
+                    msg = 'Problem with url reference ' + url
+                    msg += '. Please re-upload the information.'
+                    list_error.append(msg)
+
+            # define unique 'file reference(s)'
+            dataset['file_upload'][:] = dataset_keep
+
         else:
             msg = 'No file(s) were uploaded'
             list_error.append(msg)
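The url branch treats a remote reference as a just-in-time upload: the extension is parsed from the url path, the body is fetched with urllib, and the bytes are wrapped in cStringIO so downstream converters see the same file-like object a browser upload produces. A standalone Python 2 sketch of that technique (the url is illustrative, and error handling is reduced to the essentials):

import os.path
import urllib
import cStringIO

# illustrative url; any small csv/xml/json reference works the same way
url = ('https://raw.githubusercontent.com/jeff1evesque/machine-learning'
       '/master/interface/static/data/json/web_interface/svm.json')

# derive the extension from the url path: '.json' -> 'json'
file_extension = os.path.splitext(url)[1][1:].strip().lower()

if file_extension in ['csv', 'xml', 'json']:
    # buffer the remote body in memory, mimicking an uploaded file object
    remote_file = cStringIO.StringIO(urllib.urlopen(url).read())
    print remote_file.read(80)

One consequence of this design is that each remote dataset is buffered wholly in memory, which suits these small samples but is worth revisiting for large files.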
Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,12 @@
+{
+    "properties": {
+        "dataset_type": "dataset_url",
+        "session_type": "data_append",
+        "model_type": "svm",
+        "session_id": "1"
+    },
+    "dataset": [
+        "https://raw.githubusercontent.com/jeff1evesque/machine-learning/master/interface/static/data/json/web_interface/svm.json",
+        "https://raw.githubusercontent.com/jeff1evesque/machine-learning/master/interface/static/data/json/web_interface/svm-1.json"
+    ]
+}
Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,12 @@
+{
+    "properties": {
+        "session_name": "sample_svm_title",
+        "dataset_type": "dataset_url",
+        "session_type": "data_new",
+        "model_type": "svm"
+    },
+    "dataset": [
+        "https://raw.githubusercontent.com/jeff1evesque/machine-learning/master/interface/static/data/json/web_interface/svm.json",
+        "https://raw.githubusercontent.com/jeff1evesque/machine-learning/master/interface/static/data/json/web_interface/svm-1.json"
+    ]
+}
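The two new 12-line samples above mirror the existing svm-data-new and svm-data-append configurations, except 'dataset' is now a list of url strings rather than an inline observation payload. A short sketch of loading one and enumerating its url references (the local filename is illustrative):

import json

# illustrative filename for the second sample above
with open('svm-data-new.json') as config:
    payload = json.load(config)

assert payload['properties']['dataset_type'] == 'dataset_url'
for url in payload['dataset']:
    print(url)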
File renamed without changes.
File renamed without changes.

interface/static/data/json/programmatic_interface/svm/svm-data-append.json renamed to interface/static/data/json/programmatic_interface/svm/file_upload/svm-data-append.json

Lines changed: 2 additions & 2 deletions
@@ -1,9 +1,9 @@
 {
     "properties": {
-        "dataset_type": "json_string",
+        "dataset_type": "file_upload",
         "session_type": "data_append",
         "model_type": "svm",
-        "session_id": "1"
+        "session_id": "3"
     },
     "dataset": {
         "dep-variable-1": [

interface/static/data/json/programmatic_interface/svm/svm-data-new.json renamed to interface/static/data/json/programmatic_interface/svm/file_upload/svm-data-new.json

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 {
     "properties": {
         "session_name": "sample_svm_title",
-        "dataset_type": "json_string",
+        "dataset_type": "file_upload",
         "session_type": "data_new",
         "model_type": "svm"
     },
Lines changed: 8 additions & 0 deletions
@@ -0,0 +1,8 @@
+{
+    "properties": {
+        "session_type": "model_generate",
+        "session_id": "3",
+        "model_type": "svm",
+        "sv_kernel_type": "rbf"
+    }
+}
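This new sample requests model generation against session_id "3", matching the session_id change in the renamed data-append sample above. A hypothetical client sketch for submitting any of these JSON samples over HTTP; the endpoint route and local filename are assumptions for illustration, not taken from this commit:

import json
import urllib2

# illustrative filename for the sample above
with open('svm-model-generate.json') as config:
    body = json.dumps(json.load(config))

# assumed endpoint; substitute the project's actual api route
request = urllib2.Request(
    'http://localhost:5000/load-data',
    body,
    {'Content-Type': 'application/json'},
)
print urllib2.urlopen(request).read()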
