Introduction to Dask#
What is Dask?#
A flexible parallel computing library for analytics.
Scales workflows from a single machine to a cluster.
Integrates with existing Python libraries like NumPy, Pandas, and Scikit-learn.
Handles larger-than-memory datasets.
Dynamic task scheduling for complex workflows.
Core Concepts#
dask.array: Parallel, chunked NumPy arrays.
dask.dataframe: Parallel, chunked Pandas DataFrames.
dask.bag: General-purpose parallel lists.

import dask.dataframe as dd
from dask.distributed import Client
# Start a Dask distributed client. No scheduler address is given, and the
# summary rendered below reports a LocalCluster with 4 workers at 1.86 GiB
# each, i.e. the "2GB" memory limit is applied per worker.
client = Client(memory_limit="2GB")
# Bare expression: as the last statement of a notebook cell it displays the
# client/cluster summary (dashboard URL, workers, threads, memory).
client
Client
Client-9a4b745e-5014-11f0-bf79-d6aeee621b7f
| Connection method: Cluster object | Cluster type: distributed.LocalCluster |
| Dashboard: http://127.0.0.1:8787/status |
Cluster Info
LocalCluster
7b4b0110
| Dashboard: http://127.0.0.1:8787/status | Workers: 4 |
| Total threads: 8 | Total memory: 7.45 GiB |
| Status: running | Using processes: True |
Scheduler Info
Scheduler
Scheduler-e36e2f66-f0a5-4356-9087-e5f1b29a2189
| Comm: tcp://127.0.0.1:57558 | Workers: 4 |
| Dashboard: http://127.0.0.1:8787/status | Total threads: 8 |
| Started: Just now | Total memory: 7.45 GiB |
Workers
Worker: 0
| Comm: tcp://127.0.0.1:57571 | Total threads: 2 |
| Dashboard: http://127.0.0.1:57575/status | Memory: 1.86 GiB |
| Nanny: tcp://127.0.0.1:57561 | |
| Local directory: /var/folders/j3/513qxyhx4l30byl48tz1k1jr0000gn/T/dask-scratch-space/worker-hbn7qbl4 | |
Worker: 1
| Comm: tcp://127.0.0.1:57569 | Total threads: 2 |
| Dashboard: http://127.0.0.1:57573/status | Memory: 1.86 GiB |
| Nanny: tcp://127.0.0.1:57563 | |
| Local directory: /var/folders/j3/513qxyhx4l30byl48tz1k1jr0000gn/T/dask-scratch-space/worker-b76s2h7q | |
Worker: 2
| Comm: tcp://127.0.0.1:57570 | Total threads: 2 |
| Dashboard: http://127.0.0.1:57574/status | Memory: 1.86 GiB |
| Nanny: tcp://127.0.0.1:57565 | |
| Local directory: /var/folders/j3/513qxyhx4l30byl48tz1k1jr0000gn/T/dask-scratch-space/worker-9ooy6grx | |
Worker: 3
| Comm: tcp://127.0.0.1:57572 | Total threads: 2 |
| Dashboard: http://127.0.0.1:57576/status | Memory: 1.86 GiB |
| Nanny: tcp://127.0.0.1:57567 | |
| Local directory: /var/folders/j3/513qxyhx4l30byl48tz1k1jr0000gn/T/dask-scratch-space/worker-s5hhzhul | |
2025-06-23 11:30:40,150 - distributed.shuffle._scheduler_plugin - WARNING - Shuffle b525a3b09e7b8099527f66bb6193bd0b initialized by task ('shuffle-transfer-b525a3b09e7b8099527f66bb6193bd0b', 288) executed on worker tcp://127.0.0.1:57571
2025-06-23 11:32:55,090 - distributed.dashboard.components.scheduler - ERROR - Circle(id='p20029', ...).radius doesn't have a value set
Traceback (most recent call last):
File "/Users/syam/virtualenvs/myvenv/lib/python3.13/site-packages/distributed/utils.py", line 811, in wrapper
return func(*args, **kwargs)
File "/Users/syam/virtualenvs/myvenv/lib/python3.13/site-packages/distributed/dashboard/components/scheduler.py", line 4807, in profile_doc
doc.add_root(prof.root)
~~~~~~~~~~~~^^^^^^^^^^^
File "/Users/syam/virtualenvs/myvenv/lib/python3.13/site-packages/bokeh/document/document.py", line 321, in add_root
with self.models.freeze():
~~~~~~~~~~~~~~~~~~^^
File "/opt/homebrew/Cellar/python@3.13/3.13.5/Frameworks/Python.framework/Versions/3.13/lib/python3.13/contextlib.py", line 148, in __exit__
next(self.gen)
~~~~^^^^^^^^^^
File "/Users/syam/virtualenvs/myvenv/lib/python3.13/site-packages/bokeh/document/models.py", line 135, in freeze
self._pop_freeze()
~~~~~~~~~~~~~~~~^^
File "/Users/syam/virtualenvs/myvenv/lib/python3.13/site-packages/bokeh/document/models.py", line 288, in _pop_freeze
self.recompute()
~~~~~~~~~~~~~~^^
File "/Users/syam/virtualenvs/myvenv/lib/python3.13/site-packages/bokeh/document/models.py", line 215, in recompute
new_models |= mr.references()
~~~~~~~~~~~~~^^
File "/Users/syam/virtualenvs/myvenv/lib/python3.13/site-packages/bokeh/model/model.py", line 492, in references
return set(collect_models(self))
~~~~~~~~~~~~~~^^^^^^
File "/Users/syam/virtualenvs/myvenv/lib/python3.13/site-packages/bokeh/model/util.py", line 139, in collect_models
return collect_filtered_models(None, *input_values)
File "/Users/syam/virtualenvs/myvenv/lib/python3.13/site-packages/bokeh/model/util.py", line 119, in collect_filtered_models
visit_immediate_value_references(obj, queue_one)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^
File "/Users/syam/virtualenvs/myvenv/lib/python3.13/site-packages/bokeh/model/util.py", line 185, in visit_immediate_value_references
child = getattr(value, attr)
File "/Users/syam/virtualenvs/myvenv/lib/python3.13/site-packages/bokeh/core/property/descriptors.py", line 282, in __get__
raise UnsetValueError(f"{obj}.{self.name} doesn't have a value set")
bokeh.core.property.descriptors.UnsetValueError: Circle(id='p20029', ...).radius doesn't have a value set
2025-06-23 11:32:55,093 - tornado.application - ERROR - Uncaught exception GET /profile (127.0.0.1)
HTTPServerRequest(protocol='http', host='127.0.0.1:8787', method='GET', uri='/profile', version='HTTP/1.1', remote_ip='127.0.0.1')
Traceback (most recent call last):
File "/Users/syam/virtualenvs/myvenv/lib/python3.13/site-packages/tornado/web.py", line 1790, in _execute
result = await result
^^^^^^^^^^^^
File "/Users/syam/virtualenvs/myvenv/lib/python3.13/site-packages/bokeh/server/views/doc_handler.py", line 54, in get
session = await self.get_session()
^^^^^^^^^^^^^^^^^^^^^^^^
File "/Users/syam/virtualenvs/myvenv/lib/python3.13/site-packages/bokeh/server/views/session_handler.py", line 145, in get_session
session = await self.application_context.create_session_if_needed(session_id, self.request, token)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Users/syam/virtualenvs/myvenv/lib/python3.13/site-packages/bokeh/server/contexts.py", line 240, in create_session_if_needed
self._application.initialize_document(doc)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^
File "/Users/syam/virtualenvs/myvenv/lib/python3.13/site-packages/bokeh/application/application.py", line 190, in initialize_document
h.modify_document(doc)
~~~~~~~~~~~~~~~~~^^^^^
File "/Users/syam/virtualenvs/myvenv/lib/python3.13/site-packages/bokeh/application/handlers/function.py", line 140, in modify_document
self._func(doc)
~~~~~~~~~~^^^^^
File "/Users/syam/virtualenvs/myvenv/lib/python3.13/site-packages/distributed/utils.py", line 811, in wrapper
return func(*args, **kwargs)
File "/Users/syam/virtualenvs/myvenv/lib/python3.13/site-packages/distributed/dashboard/components/scheduler.py", line 4807, in profile_doc
doc.add_root(prof.root)
~~~~~~~~~~~~^^^^^^^^^^^
File "/Users/syam/virtualenvs/myvenv/lib/python3.13/site-packages/bokeh/document/document.py", line 321, in add_root
with self.models.freeze():
~~~~~~~~~~~~~~~~~~^^
File "/opt/homebrew/Cellar/python@3.13/3.13.5/Frameworks/Python.framework/Versions/3.13/lib/python3.13/contextlib.py", line 148, in __exit__
next(self.gen)
~~~~^^^^^^^^^^
File "/Users/syam/virtualenvs/myvenv/lib/python3.13/site-packages/bokeh/document/models.py", line 135, in freeze
self._pop_freeze()
~~~~~~~~~~~~~~~~^^
File "/Users/syam/virtualenvs/myvenv/lib/python3.13/site-packages/bokeh/document/models.py", line 288, in _pop_freeze
self.recompute()
~~~~~~~~~~~~~~^^
File "/Users/syam/virtualenvs/myvenv/lib/python3.13/site-packages/bokeh/document/models.py", line 215, in recompute
new_models |= mr.references()
~~~~~~~~~~~~~^^
File "/Users/syam/virtualenvs/myvenv/lib/python3.13/site-packages/bokeh/model/model.py", line 492, in references
return set(collect_models(self))
~~~~~~~~~~~~~~^^^^^^
File "/Users/syam/virtualenvs/myvenv/lib/python3.13/site-packages/bokeh/model/util.py", line 139, in collect_models
return collect_filtered_models(None, *input_values)
File "/Users/syam/virtualenvs/myvenv/lib/python3.13/site-packages/bokeh/model/util.py", line 119, in collect_filtered_models
visit_immediate_value_references(obj, queue_one)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^
File "/Users/syam/virtualenvs/myvenv/lib/python3.13/site-packages/bokeh/model/util.py", line 185, in visit_immediate_value_references
child = getattr(value, attr)
File "/Users/syam/virtualenvs/myvenv/lib/python3.13/site-packages/bokeh/core/property/descriptors.py", line 282, in __get__
raise UnsetValueError(f"{obj}.{self.name} doesn't have a value set")
bokeh.core.property.descriptors.UnsetValueError: Circle(id='p20029', ...).radius doesn't have a value set
2025-06-23 11:46:09,081 - distributed.shuffle._scheduler_plugin - WARNING - Shuffle b525a3b09e7b8099527f66bb6193bd0b deactivated due to stimulus 'task-finished-1750671969.0796459'
# Connection settings for the S3-compatible object store: anonymous access
# over SSL against a custom endpoint.
storage_options = {
    "anon": True,
    "use_ssl": True,
    "client_kwargs": {
        "endpoint_url": "https://object-store.os-api.cci1.ecmwf.int",
    },
}

# Read every semicolon-delimited CSV two directory levels below copernicus/.
ddf = dd.read_csv(
    "s3://MoBucket/copernicus/*/*/*.csv",
    storage_options=storage_options,
    sep=";",
)

# The tile identifier is the two digits plus three capital letters that sit
# between "_T" and "_" in the "Name" column; column 0 of the extract result
# holds that single capture group.
tile_ids = ddf["Name"].str.extract(r"_T(\d{2}[A-Z]{3})_")[0]
ddf["Tile"] = tile_ids

# Count rows per tile, materialise the result, and expose the counts as an
# "image_count" column next to "Tile".
result = ddf.groupby("Tile").size().compute().reset_index(name="image_count")
print(result.head())
Tile image_count
0 31UGS 2831
1 23XMG 8694
2 32WMT 2165
3 32WPD 1897
4 28QEL 1820
# The Dask computation is finished; close the distributed client.
client.close()
DuckDB, a Powerful OLAP Database, vs. Dask#
import duckdb
import s3fs
# Step 1: List CSV files using s3fs
# The recursive glob is resolved here, with anonymous access to the object
# store, so that DuckDB is handed an explicit list of files.
fs = s3fs.S3FileSystem(
    anon=True,
    client_kwargs={"endpoint_url": "https://object-store.os-api.cci1.ecmwf.int"},
)
file_list = fs.glob("MoBucket/copernicus/**/*.csv")

# Prepend 's3://' so DuckDB can read them
file_paths = [f"s3://{path}" for path in file_list]
if not file_paths:
    # Fail early with a clear message instead of sending DuckDB an empty list.
    raise RuntimeError("No CSV files found — check pattern or file structure.")

# Step 2: Use DuckDB to query
con = duckdb.connect()
# Register the S3 endpoint settings as a DuckDB secret. This is a plain
# string: there is nothing to interpolate, so no f-prefix is needed.
query = """
CREATE OR REPLACE SECRET secret (
TYPE s3,
ENDPOINT 'object-store.os-api.cci1.ecmwf.int',
USE_SSL true,
URL_STYLE 'path'
);
"""
con.execute(query)

# Step 3: Run the query using the resolved file list
# Render the paths as a SQL list literal explicitly. Interpolating the Python
# list repr only happens to be valid SQL for "simple" paths: a path with a
# single quote would be rendered with double quotes, and a backslash would be
# doubled. Single-quoted strings with '' escaping are always valid SQL.
file_list_sql = ", ".join(
    "'" + path.replace("'", "''") + "'" for path in file_paths
)
# Doubled braces are literal braces inside the f-string, and "\\d" reaches
# DuckDB as the regex class \d.
query = f"""
SELECT
REGEXP_EXTRACT(Name, '_T(\\d{{2}}[A-Z]{{3}})_', 1) AS Tile,
COUNT(*) AS image_count
FROM read_csv_auto([{file_list_sql}], delim=';')
GROUP BY Tile
"""
df = con.execute(query).df()
print(df.head())
Tile image_count
0 53LLD 1767
1 33SWR 1878
2 31UCQ 2769
3 33STA 1884
4 34UCC 1897
# Look up tile 31UGS, which also appears in the Dask output above, to
# cross-check that both engines report the same image count for it.
df[df["Tile"] == "31UGS"]
| Tile | image_count | |
|---|---|---|
| 1897 | 31UGS | 2831 |
# Close the DuckDB connection now that the queries are done.
con.close()