Skip to content

Generating SQL

Suppose you want to write Polars syntax and translate it to SQL. For example, what's the SQL equivalent to:

import narwhals as nw
from narwhals.typing import IntoFrameT


def avg_monthly_price(df_native: IntoFrameT) -> IntoFrameT:
    return (
        nw.from_native(df_native)
        .group_by(nw.col("date").dt.truncate("1mo"))
        .agg(nw.col("price").mean())
        .sort("date")
        .to_native()
    )

?

There are several ways to find out.

Via SQLFrame (most lightweight solution)

The most lightweight solution which does not require any heavy dependencies, nor any actual table or dataframe, is with SQLFrame.

from sqlframe.standalone import StandaloneSession

session = StandaloneSession.builder.getOrCreate()
session.catalog.add_table("prices", column_mapping={"date": "date", "price": "float"})
df = nw.from_native(session.read.table("prices"))

print(avg_monthly_price(df).sql(dialect="duckdb"))
WITH "t20239543" AS (
  SELECT
    CAST(DATE_TRUNC('MONTH', CAST("prices"."date" AS TIMESTAMPTZ)) AS TIMESTAMPTZ) AS "_date_tmp",
    AVG("prices"."price") AS "price"
  FROM "prices" AS "prices"
  GROUP BY
    CAST(DATE_TRUNC('MONTH', CAST("prices"."date" AS TIMESTAMPTZ)) AS TIMESTAMPTZ)
)
SELECT
  "t20239543"."_date_tmp" AS "date",
  "t20239543"."price" AS "price"
FROM "t20239543" AS "t20239543"
ORDER BY
  "date" NULLS FIRST

Or, to print the SQL code in a different dialect (say, databricks):

print(avg_monthly_price(df).sql(dialect="databricks"))
WITH `t20239543` AS (
  SELECT
    CAST(DATE_TRUNC('MONTH', CAST(`prices`.`date` AS TIMESTAMP)) AS TIMESTAMP) AS `_date_tmp`,
    AVG(`prices`.`price`) AS `price`
  FROM `prices` AS `prices`
  GROUP BY
    CAST(DATE_TRUNC('MONTH', CAST(`prices`.`date` AS TIMESTAMP)) AS TIMESTAMP)
)
SELECT
  `t20239543`.`_date_tmp` AS `date`,
  `t20239543`.`price` AS `price`
FROM `t20239543` AS `t20239543`
ORDER BY
  `date`

Via DuckDB

You can also generate SQL directly from DuckDB.

import duckdb

conn = duckdb.connect()
conn.sql("""CREATE TABLE prices (date DATE, price DOUBLE);""")

df = nw.from_native(conn.table("prices"))
print(avg_monthly_price(df).sql_query())
SELECT * FROM (SELECT _date_tmp AS date, price FROM (SELECT _date_tmp, mean(price) AS price FROM (SELECT date, price, time_bucket('1 month', date, '1970-01-01 00:00:00'::TIMESTAMP) AS _date_tmp FROM main.prices) AS prices GROUP BY ALL) AS prices) AS prices ORDER BY date NULLS FIRST

To make it look a bit prettier, we can pass it to SQLGlot:

import sqlglot

print(sqlglot.transpile(avg_monthly_price(df).sql_query(), pretty=True)[0])
SELECT
  *
FROM (
  SELECT
    _date_tmp AS date,
    price
  FROM (
    SELECT
      _date_tmp,
      MEAN(price) AS price
    FROM (
      SELECT
        date,
        price,
        TIME_BUCKET('1 month', date, CAST('1970-01-01 00:00:00' AS TIMESTAMP)) AS _date_tmp
      FROM main.prices
    ) AS prices
    GROUP BY ALL
  ) AS prices
) AS prices
ORDER BY
  date

Via Ibis

We can also use Ibis to generate SQL:

import ibis

t = ibis.table({"date": "date", "price": "double"}, name="prices")
print(ibis.to_sql(avg_monthly_price(t)))
SELECT
  "t2"."_date_tmp" AS "date",
  "t2"."price"
FROM (
  SELECT
    "t1"."_date_tmp",
    AVG("t1"."price") AS "price"
  FROM (
    SELECT
      "t0"."date",
      "t0"."price",
      DATE_TRUNC('MONTH', "t0"."date") AS "_date_tmp"
    FROM "prices" AS "t0"
  ) AS "t1"
  GROUP BY
    1
) AS "t2"
ORDER BY
  "t2"."_date_tmp" ASC NULLS FIRST