Introduction to Dask#

dask

What is Dask?#

  • A flexible parallel computing library for analytics.

  • Scales workflows from a single machine to a cluster.

  • Integrates with existing Python libraries like NumPy, Pandas, and Scikit-learn.

  • Handles larger-than-memory datasets.

  • Dynamic task scheduling for complex workflows.

Core Concepts#

  • dask.array: Parallel, chunked NumPy arrays.

  • dask.dataframe: Parallel, chunked Pandas DataFrames.

  • dask.bag: General-purpose parallel lists.

dask cluster

import dask.dataframe as dd
from dask.distributed import Client
# Start a local cluster and connect a client to it. Per the
# distributed.LocalCluster documentation, memory_limit applies to each worker.
client = Client(memory_limit="2GB")
# Bare expression: the notebook renders the client summary, including the
# dashboard link.
client

Client

Client-9a4b745e-5014-11f0-bf79-d6aeee621b7f

Connection method: Cluster object Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:8787/status

Cluster Info

2025-06-23 11:30:40,150 - distributed.shuffle._scheduler_plugin - WARNING - Shuffle b525a3b09e7b8099527f66bb6193bd0b initialized by task ('shuffle-transfer-b525a3b09e7b8099527f66bb6193bd0b', 288) executed on worker tcp://127.0.0.1:57571
2025-06-23 11:32:55,090 - distributed.dashboard.components.scheduler - ERROR - Circle(id='p20029', ...).radius doesn't have a value set
Traceback (most recent call last):
  File "/Users/syam/virtualenvs/myvenv/lib/python3.13/site-packages/distributed/utils.py", line 811, in wrapper
    return func(*args, **kwargs)
  File "/Users/syam/virtualenvs/myvenv/lib/python3.13/site-packages/distributed/dashboard/components/scheduler.py", line 4807, in profile_doc
    doc.add_root(prof.root)
    ~~~~~~~~~~~~^^^^^^^^^^^
  File "/Users/syam/virtualenvs/myvenv/lib/python3.13/site-packages/bokeh/document/document.py", line 321, in add_root
    with self.models.freeze():
         ~~~~~~~~~~~~~~~~~~^^
  File "/opt/homebrew/Cellar/python@3.13/3.13.5/Frameworks/Python.framework/Versions/3.13/lib/python3.13/contextlib.py", line 148, in __exit__
    next(self.gen)
    ~~~~^^^^^^^^^^
  File "/Users/syam/virtualenvs/myvenv/lib/python3.13/site-packages/bokeh/document/models.py", line 135, in freeze
    self._pop_freeze()
    ~~~~~~~~~~~~~~~~^^
  File "/Users/syam/virtualenvs/myvenv/lib/python3.13/site-packages/bokeh/document/models.py", line 288, in _pop_freeze
    self.recompute()
    ~~~~~~~~~~~~~~^^
  File "/Users/syam/virtualenvs/myvenv/lib/python3.13/site-packages/bokeh/document/models.py", line 215, in recompute
    new_models |= mr.references()
                  ~~~~~~~~~~~~~^^
  File "/Users/syam/virtualenvs/myvenv/lib/python3.13/site-packages/bokeh/model/model.py", line 492, in references
    return set(collect_models(self))
               ~~~~~~~~~~~~~~^^^^^^
  File "/Users/syam/virtualenvs/myvenv/lib/python3.13/site-packages/bokeh/model/util.py", line 139, in collect_models
    return collect_filtered_models(None, *input_values)
  File "/Users/syam/virtualenvs/myvenv/lib/python3.13/site-packages/bokeh/model/util.py", line 119, in collect_filtered_models
    visit_immediate_value_references(obj, queue_one)
    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^
  File "/Users/syam/virtualenvs/myvenv/lib/python3.13/site-packages/bokeh/model/util.py", line 185, in visit_immediate_value_references
    child = getattr(value, attr)
  File "/Users/syam/virtualenvs/myvenv/lib/python3.13/site-packages/bokeh/core/property/descriptors.py", line 282, in __get__
    raise UnsetValueError(f"{obj}.{self.name} doesn't have a value set")
bokeh.core.property.descriptors.UnsetValueError: Circle(id='p20029', ...).radius doesn't have a value set
2025-06-23 11:32:55,093 - tornado.application - ERROR - Uncaught exception GET /profile (127.0.0.1)
HTTPServerRequest(protocol='http', host='127.0.0.1:8787', method='GET', uri='/profile', version='HTTP/1.1', remote_ip='127.0.0.1')
Traceback (most recent call last):
  File "/Users/syam/virtualenvs/myvenv/lib/python3.13/site-packages/tornado/web.py", line 1790, in _execute
    result = await result
             ^^^^^^^^^^^^
  File "/Users/syam/virtualenvs/myvenv/lib/python3.13/site-packages/bokeh/server/views/doc_handler.py", line 54, in get
    session = await self.get_session()
              ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/syam/virtualenvs/myvenv/lib/python3.13/site-packages/bokeh/server/views/session_handler.py", line 145, in get_session
    session = await self.application_context.create_session_if_needed(session_id, self.request, token)
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/syam/virtualenvs/myvenv/lib/python3.13/site-packages/bokeh/server/contexts.py", line 240, in create_session_if_needed
    self._application.initialize_document(doc)
    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^
  File "/Users/syam/virtualenvs/myvenv/lib/python3.13/site-packages/bokeh/application/application.py", line 190, in initialize_document
    h.modify_document(doc)
    ~~~~~~~~~~~~~~~~~^^^^^
  File "/Users/syam/virtualenvs/myvenv/lib/python3.13/site-packages/bokeh/application/handlers/function.py", line 140, in modify_document
    self._func(doc)
    ~~~~~~~~~~^^^^^
  File "/Users/syam/virtualenvs/myvenv/lib/python3.13/site-packages/distributed/utils.py", line 811, in wrapper
    return func(*args, **kwargs)
  File "/Users/syam/virtualenvs/myvenv/lib/python3.13/site-packages/distributed/dashboard/components/scheduler.py", line 4807, in profile_doc
    doc.add_root(prof.root)
    ~~~~~~~~~~~~^^^^^^^^^^^
  File "/Users/syam/virtualenvs/myvenv/lib/python3.13/site-packages/bokeh/document/document.py", line 321, in add_root
    with self.models.freeze():
         ~~~~~~~~~~~~~~~~~~^^
  File "/opt/homebrew/Cellar/python@3.13/3.13.5/Frameworks/Python.framework/Versions/3.13/lib/python3.13/contextlib.py", line 148, in __exit__
    next(self.gen)
    ~~~~^^^^^^^^^^
  File "/Users/syam/virtualenvs/myvenv/lib/python3.13/site-packages/bokeh/document/models.py", line 135, in freeze
    self._pop_freeze()
    ~~~~~~~~~~~~~~~~^^
  File "/Users/syam/virtualenvs/myvenv/lib/python3.13/site-packages/bokeh/document/models.py", line 288, in _pop_freeze
    self.recompute()
    ~~~~~~~~~~~~~~^^
  File "/Users/syam/virtualenvs/myvenv/lib/python3.13/site-packages/bokeh/document/models.py", line 215, in recompute
    new_models |= mr.references()
                  ~~~~~~~~~~~~~^^
  File "/Users/syam/virtualenvs/myvenv/lib/python3.13/site-packages/bokeh/model/model.py", line 492, in references
    return set(collect_models(self))
               ~~~~~~~~~~~~~~^^^^^^
  File "/Users/syam/virtualenvs/myvenv/lib/python3.13/site-packages/bokeh/model/util.py", line 139, in collect_models
    return collect_filtered_models(None, *input_values)
  File "/Users/syam/virtualenvs/myvenv/lib/python3.13/site-packages/bokeh/model/util.py", line 119, in collect_filtered_models
    visit_immediate_value_references(obj, queue_one)
    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^
  File "/Users/syam/virtualenvs/myvenv/lib/python3.13/site-packages/bokeh/model/util.py", line 185, in visit_immediate_value_references
    child = getattr(value, attr)
  File "/Users/syam/virtualenvs/myvenv/lib/python3.13/site-packages/bokeh/core/property/descriptors.py", line 282, in __get__
    raise UnsetValueError(f"{obj}.{self.name} doesn't have a value set")
bokeh.core.property.descriptors.UnsetValueError: Circle(id='p20029', ...).radius doesn't have a value set
2025-06-23 11:46:09,081 - distributed.shuffle._scheduler_plugin - WARNING - Shuffle b525a3b09e7b8099527f66bb6193bd0b deactivated due to stimulus 'task-finished-1750671969.0796459'
# Options forwarded to the S3 filesystem layer: anonymous access over SSL
# against the S3-compatible object-store endpoint.
storage_options = dict(
    client_kwargs=dict(endpoint_url="https://object-store.os-api.cci1.ecmwf.int"),
    use_ssl=True,
    anon=True,
)

# Read every semicolon-delimited CSV matching the glob into one Dask DataFrame.
ddf = dd.read_csv(
    "s3://MoBucket/copernicus/*/*/*.csv",
    sep=";",
    storage_options=storage_options,
)

# The tile code is the "two digits + three capital letters" token that sits
# between "_T" and "_" in the Name column; column 0 is the single capture group.
tile_matches = ddf["Name"].str.extract(r"_T(\d{2}[A-Z]{3})_")
ddf["Tile"] = tile_matches[0]

# Count rows per tile, materialise the result with .compute(), then turn the
# resulting Series into a two-column frame (Tile, image_count).
rows_per_tile = ddf.groupby("Tile").size()
result = rows_per_tile.compute().reset_index(name="image_count")
print(result.head())
    Tile  image_count
0  31UGS         2831
1  23XMG         8694
2  32WMT         2165
3  32WPD         1897
4  28QEL         1820
# Close the Dask client now that the result has been computed and printed.
client.close()

DuckDB, a Powerful OLAP Database, vs. Dask#

import duckdb
import s3fs

# Step 1: List CSV files using s3fs
# The glob is resolved on the Python side (anonymous access to the
# S3-compatible endpoint) and DuckDB is handed the explicit file list.
fs = s3fs.S3FileSystem(
    anon=True,
    client_kwargs={"endpoint_url": "https://object-store.os-api.cci1.ecmwf.int"},
)
file_list = fs.glob(
    "MoBucket/copernicus/**/*.csv"
)

# Prepend 's3://' so DuckDB can read them
file_paths = [f"s3://{path}" for path in file_list]

# Fail early with a clear message rather than sending DuckDB an empty list.
if not file_paths:
    raise RuntimeError("No CSV files found — check pattern or file structure.")

# Step 2: Use DuckDB to query
con = duckdb.connect()
# Register the S3 endpoint settings DuckDB should use for s3:// URLs.
# Plain string: nothing is interpolated into this statement.
query = """
    CREATE OR REPLACE SECRET secret (
        TYPE s3,
        ENDPOINT 'object-store.os-api.cci1.ecmwf.int',
        USE_SSL true,
        URL_STYLE 'path'
);
"""
con.execute(query)

# Step 3: Run the query using the resolved file list
# Render the paths as an explicit SQL list literal instead of relying on
# Python's list repr: repr() switches to double quotes for any path that
# contains a single quote, which SQL would read as an identifier. Doubling
# embedded single quotes keeps every path a valid SQL string literal.
file_list_sql = "[" + ", ".join(
    "'" + path.replace("'", "''") + "'" for path in file_paths
) + "]"

# Braces in the regex are doubled because this is an f-string; the SQL that
# reaches DuckDB contains '_T(\d{2}[A-Z]{3})_'.
query = f"""
SELECT
  REGEXP_EXTRACT(Name, '_T(\\d{{2}}[A-Z]{{3}})_', 1) AS Tile,
  COUNT(*) AS image_count
FROM read_csv_auto({file_list_sql}, delim=';')
GROUP BY Tile
"""

# Execute and fetch the grouped counts as a pandas DataFrame.
df = con.execute(query).df()
print(df.head())
    Tile  image_count
0  53LLD         1767
1  33SWR         1878
2  31UCQ         2769
3  33STA         1884
4  34UCC         1897
# Look up a single tile so its count can be compared with the Dask result above.
df[df["Tile"] == "31UGS"]
Tile image_count
1897 31UGS 2831
# Close the DuckDB connection once the comparison is done.
con.close()