Skip to main content

Analyze local files with clickhouse-local

clickhouse-local is like running a temporary ClickHouse server that only lasts for your session. It's great for exploring local files to quickly experiment with data, without needing to set up a proper ClickHouse deployment.

It's possible to use to clickhouse-local to analyze files of structured data directly from the local file system.

SELECT count() FROM file('final.ndjson');

SELECT count()
FROM file('final.ndjson')

Query id: a0a1f4b5-40cb-4125-b68b-4ed978c41576

┌─count()─┐
100000
└─────────┘

1 row in set. Elapsed: 0.659 sec. Processed 55.38 thousand rows, 96.97 MB (84.04 thousand rows/s., 147.16 MB/s.)


SELECT countDistinct(public_ip) FROM file('final.ndjson');

SELECT countDistinct(public_ip)
FROM file('final.ndjson')

Query id: 21df7ca5-e3bf-4010-b2a0-bf8b854502d2

┌─uniqExact(public_ip)─┐
71
└──────────────────────┘

1 row in set. Elapsed: 0.225 sec. Processed 77.53 thousand rows, 96.45 MB (345.22 thousand rows/s., 429.46 MB/s.)

You can create tables from the local file if you want to do more than one analysis on the data. The table is destroyed when your clickhouse-local session ends.

CREATE TABLE auxiliar Engine=MergeTree() ORDER BY tuple() AS SELECT * FROM file('final.ndjson');

CREATE TABLE auxiliar
ENGINE = MergeTree
ORDER BY tuple() AS
SELECT *
FROM file('final.ndjson')

Query id: a1732be5-a912-41a5-bf8e-e524db8f12f4

Ok.

0 rows in set. Elapsed: 0.486 sec. Processed 100.00 thousand rows, 161.88 MB (205.73 thousand rows/s., 333.03 MB/s.)


SHOW CREATE TABLE auxiliar;

SHOW CREATE TABLE auxiliar

Query id: dffbcd4b-2c08-4d07-916c-b8e1b668c202


CREATE TABLE _local.auxiliar
(
`timestamp_iso8601` Nullable(DateTime64(9)),
`host` Nullable(String),
`public_ip` Nullable(String),
`request_method` Nullable(String),
`request_path` Nullable(String),
`status` Nullable(Int64),
`body_bytes_sent` Nullable(Int64),
`request_length` Nullable(Int64),
`first_byte` Nullable(Float64),
`request_time` Nullable(Float64),
`lambda_name` Nullable(String),
`lambda_region` Nullable(String),
`path_type` Nullable(String),
`hit_level` Nullable(String),
`hit_state` Nullable(String),
`error_details` Nullable(String),
`owner_id` Nullable(String),
`project_id` Nullable(String),
`target_path` Nullable(String),
`deployment_plan` Nullable(String),
`lambda_duration` Nullable(Float64),
`lambda_billed_duration` Nullable(Int64),
`lambda_memory_size` Nullable(Int64),
`http_user_agent` Nullable(String),
`full_vercel_id` Nullable(String),
`dc` Nullable(String),
`public_ip_country` Nullable(String),
`public_ip_city` Nullable(String),
`asn_id` Nullable(String),
`asn_name` Nullable(String)
)
ENGINE = MergeTree
ORDER BY tuple()
SETTINGS index_granularity = 8192


1 row in set. Elapsed: 0.001 sec.

SELECT count(), status - status % 100 AS status_range FROM auxiliar GROUP BY status_range;

SELECT
count(),
status - (status % 100) AS status_range
FROM auxiliar
GROUP BY status_range

Query id: 2685e0d4-827a-4306-8598-5d6e589dbd15

┌─count()─┬─status_range─┐
74000200
5000400
21000300
└─────────┴──────────────┘

3 rows in set. Elapsed: 0.015 sec.