1616#
1717# This product includes software developed at
1818# data.world, Inc.(http://data.world/).
19-
19+ import functools
2020import gzip
21+ from concurrent .futures import ThreadPoolExecutor
2122from time import sleep
2223
2324import backoff
@@ -46,6 +47,7 @@ def __init__(self, api_token, **kwargs):
4647 self ._api_url = kwargs .get ('api_url' , 'https://api.data.world/v0' )
4748 self ._conn_timeout = kwargs .get ('connect_timeout' , 3.05 )
4849 self ._read_timeout = kwargs .get ('read_timeout' , 600 )
50+ self ._max_threads = kwargs .get ('max_threads' , 10 )
4951
5052 self ._session = requests .Session ()
5153 default_headers = {
@@ -58,6 +60,11 @@ def __init__(self, api_token, **kwargs):
5860 self ._session .mount (self ._api_url ,
5961 BackoffAdapter (GzipAdapter (HTTPAdapter ())))
6062
63+ # Create a thread pool capped at max_threads workers.
64+ self ._executor = ThreadPoolExecutor (
65+ max_workers = self ._max_threads
66+ )
67+
6168 def connection_check (self ):
6269 """Verify network connectivity
6370
@@ -100,7 +107,7 @@ def append_stream(self, owner, dataset, stream, records):
100107 raise convert_requests_exception (e )
101108
102109 async def append_stream_chunked (
103- self , owner , dataset , stream , queue , chunk_size ):
110+ self , owner , dataset , stream , queue , chunk_size , loop ):
104111 """Asynchronously append records to a stream in a data.world dataset
105112
106113 :param owner: User or organization ID of the owner of the dataset
@@ -122,20 +129,33 @@ async def append_stream_chunked(
122129
123130 delayed_exception = None
124131 # noinspection PyTypeChecker
132+ pending_task = None
125133 async for chunk in to_chunks (queue , chunk_size ):
126134 if delayed_exception is None :
127135 try :
128136 logger .info ('Uploading {} records in batch #{} '
129137 'from {} stream ' .format (
130138 len (chunk ), counter .value , stream ))
131- # TODO Invoke append_stream in a separate thread
132- self .append_stream (owner , dataset , stream , chunk )
139+
140+ if pending_task is not None :
141+ # Wait for the previous upload so chunks are appended in order
142+ await pending_task
143+
144+ # Run the blocking API call on the executor's thread pool
145+ pending_task = loop .run_in_executor (
146+ self ._executor ,
147+ functools .partial (self .append_stream ,
148+ owner , dataset , stream , chunk )
149+ )
133150 counter .increment ()
134151 except Exception as e :
135152 delayed_exception = e
136153 else :
137154 pass # Must exhaust queue
138155
156+ if pending_task is not None :
157+ await pending_task
158+
139159 if delayed_exception is not None :
140160 raise delayed_exception
141161
0 commit comments