diff --git a/.github/workflows/release_python.yml b/.github/workflows/release_python.yml index 59c8f8c9..d7bd04cc 100644 --- a/.github/workflows/release_python.yml +++ b/.github/workflows/release_python.yml @@ -49,6 +49,9 @@ jobs: steps: - uses: actions/checkout@v4 + - name: Generate Python README + run: python bindings/python/generate_readme.py + - name: Install protoc run: sudo apt-get update && sudo apt-get install -y protobuf-compiler @@ -78,6 +81,9 @@ jobs: steps: - uses: actions/checkout@v4 + - name: Generate Python README + run: python3 bindings/python/generate_readme.py + - name: Install protoc (Linux) if: runner.os == 'Linux' run: sudo apt-get update && sudo apt-get install -y protobuf-compiler diff --git a/.gitignore b/.gitignore index 9c585d86..476f84e9 100644 --- a/.gitignore +++ b/.gitignore @@ -20,6 +20,7 @@ Cargo.lock .vscode/ # Python +bindings/python/GENERATED_README.md __pycache__/ *.py[cod] *$py.class @@ -29,4 +30,22 @@ dist/ build/ # CPP -*CMakeFiles/ \ No newline at end of file +*CMakeFiles/ + +# Website (Docusaurus) +website/node_modules +website/build +website/.docusaurus +website/.cache-loader +website/.env.local +website/.env.development.local +website/.env.test.local +website/.env.production.local +website/npm-debug.log* +website/yarn-debug.log* +website/yarn-error.log* +website/package-lock.json +website/versioned_docs +website/versioned_sidebars +website/versions.json +website/pnpm-lock.yaml \ No newline at end of file diff --git a/.licenserc.yaml b/.licenserc.yaml index a3cfcd14..d3238563 100644 --- a/.licenserc.yaml +++ b/.licenserc.yaml @@ -21,10 +21,11 @@ header: copyright-owner: Apache Software Foundation paths-ignore: - - '.github/PULL_REQUEST_TEMPLATE.md' - '.gitignore' - 'LICENSE' - 'NOTICE' - 'DISCLAIMER' - 'bindings/python/fluss/py.typed' + - 'website/**' + - '**/*.md' comment: on-failure diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md index a0669a75..a1180d6f 100644 --- a/DEVELOPMENT.md +++ b/DEVELOPMENT.md @@ -1,21 +1,3 @@ - - # Development Guide Welcome to the development guide of `fluss-rust`! This project builds `fluss-rust` client and language specific bindings. diff --git a/README.md b/README.md index a42c0f36..dafe19c8 100644 --- a/README.md +++ b/README.md @@ -1,21 +1,3 @@ - - # Apache Fluss™ Rust (Incubating) ![Experimental](https://img.shields.io/badge/status-experimental-orange) diff --git a/bindings/cpp/README.md b/bindings/cpp/README.md index 2556a4b6..539e3130 100644 --- a/bindings/cpp/README.md +++ b/bindings/cpp/README.md @@ -1,21 +1,3 @@ - - # Apache Fluss™ C++ Bindings (Incubating) C++ bindings for Fluss, built on top of the [fluss-rust](../../crates/fluss) client. The API is exposed via a C++ header ([include/fluss.hpp](include/fluss.hpp)) and implemented with Rust FFI. diff --git a/bindings/python/API_REFERENCE.md b/bindings/python/API_REFERENCE.md deleted file mode 100644 index 3749ee1d..00000000 --- a/bindings/python/API_REFERENCE.md +++ /dev/null @@ -1,305 +0,0 @@ - - -# Python API Reference - -Complete API reference for the Fluss Python client. For a usage guide with examples, see the [Python Client Guide](README.md). 
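The classes below are typically composed in a fixed order: `Config`, then `FlussConnection`, then admin and table handles. A minimal orientation sketch (assuming a cluster reachable at `127.0.0.1:9123` and the `quick_start` table from the usage guide):

```python
import fluss

async def example():
    config = fluss.Config({"bootstrap.servers": "127.0.0.1:9123"})
    conn = await fluss.FlussConnection.create(config)
    try:
        admin = await conn.get_admin()      # administrative operations
        table = await conn.get_table(fluss.TablePath("fluss", "quick_start"))
        ...
    finally:
        conn.close()
```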
- -## `Config` - -| Method / Property | Description | -|---|---| -| `Config(properties: dict = None)` | Create config from a dict of key-value pairs | -| `.bootstrap_servers` | Get/set coordinator server address | -| `.writer_request_max_size` | Get/set max request size in bytes | -| `.writer_batch_size` | Get/set write batch size in bytes | - -## `FlussConnection` - -| Method | Description | -|---|---| -| `await FlussConnection.create(config) -> FlussConnection` | Connect to a Fluss cluster | -| `await conn.get_admin() -> FlussAdmin` | Get admin interface | -| `await conn.get_table(table_path) -> FlussTable` | Get a table for read/write operations | -| `conn.close()` | Close the connection | - -Supports `with` statement (context manager). - -## `FlussAdmin` - -| Method | Description | -|---|---| -| `await create_database(name, database_descriptor=None, ignore_if_exists=False)` | Create a database | -| `await drop_database(name, ignore_if_not_exists=False, cascade=True)` | Drop a database | -| `await list_databases() -> list[str]` | List all databases | -| `await database_exists(name) -> bool` | Check if a database exists | -| `await get_database_info(name) -> DatabaseInfo` | Get database metadata | -| `await create_table(table_path, table_descriptor, ignore_if_exists=False)` | Create a table | -| `await drop_table(table_path, ignore_if_not_exists=False)` | Drop a table | -| `await get_table_info(table_path) -> TableInfo` | Get table metadata | -| `await list_tables(database_name) -> list[str]` | List tables in a database | -| `await table_exists(table_path) -> bool` | Check if a table exists | -| `await list_offsets(table_path, bucket_ids, offset_type, timestamp=None) -> dict[int, int]` | Get offsets for buckets | -| `await list_partition_offsets(table_path, partition_name, bucket_ids, offset_type, timestamp=None) -> dict[int, int]` | Get offsets for a partition's buckets | -| `await create_partition(table_path, partition_spec, ignore_if_exists=False)` | Create a partition | -| `await drop_partition(table_path, partition_spec, ignore_if_not_exists=False)` | Drop a partition | -| `await list_partition_infos(table_path) -> list[PartitionInfo]` | List partitions | -| `await get_latest_lake_snapshot(table_path) -> LakeSnapshot` | Get latest lake snapshot | - -## `FlussTable` - -| Method | Description | -|---|---| -| `new_scan() -> TableScan` | Create a scan builder | -| `new_append() -> TableAppend` | Create an append builder for log tables | -| `new_upsert() -> TableUpsert` | Create an upsert builder for PK tables | -| `new_lookup() -> TableLookup` | Create a lookup builder for PK tables | -| `get_table_info() -> TableInfo` | Get table metadata | -| `get_table_path() -> TablePath` | Get table path | -| `has_primary_key() -> bool` | Check if table has a primary key | - -## `TableScan` - -| Method | Description | -|---|---| -| `.project(indices) -> TableScan` | Project columns by index | -| `.project_by_name(names) -> TableScan` | Project columns by name | -| `await .create_log_scanner() -> LogScanner` | Create record-based scanner (for `poll()`) | -| `await .create_record_batch_log_scanner() -> LogScanner` | Create batch-based scanner (for `poll_arrow()`, `to_arrow()`, etc.) | - -## `TableAppend` - -Builder for creating an `AppendWriter`. Obtain via `FlussTable.new_append()`. - -| Method | Description | -|---|---| -| `.create_writer() -> AppendWriter` | Create the append writer | - -## `TableUpsert` - -Builder for creating an `UpsertWriter`. Obtain via `FlussTable.new_upsert()`. 
- -| Method | Description | -|---|---| -| `.partial_update_by_name(columns) -> TableUpsert` | Configure partial update by column names | -| `.partial_update_by_index(indices) -> TableUpsert` | Configure partial update by column indices | -| `.create_writer() -> UpsertWriter` | Create the upsert writer | - -## `TableLookup` - -Builder for creating a `Lookuper`. Obtain via `FlussTable.new_lookup()`. - -| Method | Description | -|---|---| -| `.create_lookuper() -> Lookuper` | Create the lookuper | - -## `AppendWriter` - -| Method | Description | -|---|---| -| `.append(row) -> WriteResultHandle` | Append a row (dict, list, or tuple) | -| `.write_arrow(table)` | Write a PyArrow Table | -| `.write_arrow_batch(batch) -> WriteResultHandle` | Write a PyArrow RecordBatch | -| `.write_pandas(df)` | Write a Pandas DataFrame | -| `await .flush()` | Flush all pending writes | - -## `UpsertWriter` - -| Method | Description | -|---|---| -| `.upsert(row) -> WriteResultHandle` | Upsert a row (insert or update by PK) | -| `.delete(pk) -> WriteResultHandle` | Delete a row by primary key | -| `await .flush()` | Flush all pending operations | - -## `WriteResultHandle` - -| Method | Description | -|---|---| -| `await .wait()` | Wait for server acknowledgment of this write | - -## `Lookuper` - -| Method | Description | -|---|---| -| `await .lookup(pk) -> dict \| None` | Lookup a row by primary key | - -## `LogScanner` - -| Method | Description | -|---|---| -| `.subscribe(bucket_id, start_offset)` | Subscribe to a bucket | -| `.subscribe_buckets(bucket_offsets)` | Subscribe to multiple buckets (`{bucket_id: offset}`) | -| `.subscribe_partition(partition_id, bucket_id, start_offset)` | Subscribe to a partition bucket | -| `.subscribe_partition_buckets(partition_bucket_offsets)` | Subscribe to multiple partition+bucket combos (`{(part_id, bucket_id): offset}`) | -| `.unsubscribe(bucket_id)` | Unsubscribe from a bucket (non-partitioned tables) | -| `.unsubscribe_partition(partition_id, bucket_id)` | Unsubscribe from a partition bucket | -| `.poll(timeout_ms) -> list[ScanRecord]` | Poll individual records (record scanner only) | -| `.poll_arrow(timeout_ms) -> pa.Table` | Poll as Arrow Table (batch scanner only) | -| `.poll_record_batch(timeout_ms) -> list[RecordBatch]` | Poll batches with metadata (batch scanner only) | -| `.to_arrow() -> pa.Table` | Read all subscribed data as Arrow Table (batch scanner only) | -| `.to_pandas() -> pd.DataFrame` | Read all subscribed data as DataFrame (batch scanner only) | - -## `ScanRecord` - -| Property | Description | -|---|---| -| `.bucket -> TableBucket` | Bucket this record belongs to | -| `.offset -> int` | Record offset in the log | -| `.timestamp -> int` | Record timestamp | -| `.change_type -> ChangeType` | Change type (AppendOnly, Insert, UpdateBefore, UpdateAfter, Delete) | -| `.row -> dict` | Row data as `{column_name: value}` | - -## `RecordBatch` - -| Property | Description | -|---|---| -| `.batch -> pa.RecordBatch` | Arrow RecordBatch data | -| `.bucket -> TableBucket` | Bucket this batch belongs to | -| `.base_offset -> int` | First record offset | -| `.last_offset -> int` | Last record offset | - -## `Schema` - -| Method | Description | -|---|---| -| `Schema(schema: pa.Schema, primary_keys=None)` | Create from PyArrow schema | -| `.get_column_names() -> list[str]` | Get column names | -| `.get_column_types() -> list[str]` | Get column type names | - -## `TableDescriptor` - -| Method | Description | -|---|---| -| `TableDescriptor(schema, *, partition_keys=None, 
bucket_count=None, bucket_keys=None, comment=None, log_format=None, kv_format=None, properties=None, custom_properties=None)` | Create table descriptor | -| `.get_schema() -> Schema` | Get the schema | - -## `TablePath` - -| Method / Property | Description | -|---|---| -| `TablePath(database, table)` | Create a table path | -| `.database_name -> str` | Database name | -| `.table_name -> str` | Table name | - -## `TableInfo` - -| Property / Method | Description | -|---|---| -| `.table_id -> int` | Table ID | -| `.table_path -> TablePath` | Table path | -| `.num_buckets -> int` | Number of buckets | -| `.schema_id -> int` | Schema ID | -| `.comment -> str \| None` | Table comment | -| `.created_time -> int` | Creation timestamp | -| `.modified_time -> int` | Last modification timestamp | -| `.get_primary_keys() -> list[str]` | Primary key columns | -| `.get_partition_keys() -> list[str]` | Partition columns | -| `.get_bucket_keys() -> list[str]` | Bucket key columns | -| `.has_primary_key() -> bool` | Has primary key? | -| `.is_partitioned() -> bool` | Is partitioned? | -| `.get_schema() -> Schema` | Get table schema | -| `.get_column_names() -> list[str]` | Column names | -| `.get_column_count() -> int` | Number of columns | -| `.get_properties() -> dict` | All table properties | -| `.get_custom_properties() -> dict` | Custom properties only | - -## `PartitionInfo` - -| Property | Description | -|---|---| -| `.partition_id -> int` | Partition ID | -| `.partition_name -> str` | Partition name | - -## `DatabaseDescriptor` - -| Method / Property | Description | -|---|---| -| `DatabaseDescriptor(comment=None, custom_properties=None)` | Create descriptor | -| `.comment -> str \| None` | Database comment | -| `.get_custom_properties() -> dict` | Custom properties | - -## `DatabaseInfo` - -| Property / Method | Description | -|---|---| -| `.database_name -> str` | Database name | -| `.created_time -> int` | Creation timestamp | -| `.modified_time -> int` | Last modification timestamp | -| `.get_database_descriptor() -> DatabaseDescriptor` | Get descriptor | - -## `LakeSnapshot` - -| Property / Method | Description | -|---|---| -| `.snapshot_id -> int` | Snapshot ID | -| `.table_buckets_offset -> dict[TableBucket, int]` | All bucket offsets | -| `.get_bucket_offset(bucket) -> int \| None` | Get offset for a bucket | -| `.get_table_buckets() -> list[TableBucket]` | Get all buckets | - -## `TableBucket` - -| Method / Property | Description | -|---|---| -| `TableBucket(table_id, bucket)` | Create non-partitioned bucket | -| `TableBucket.with_partition(table_id, partition_id, bucket)` | Create partitioned bucket | -| `.table_id -> int` | Table ID | -| `.bucket_id -> int` | Bucket ID | -| `.partition_id -> int \| None` | Partition ID (None if non-partitioned) | - -## `FlussError` - -| Property | Description | -|---|---| -| `.message -> str` | Error message | - -Raised for all Fluss-specific errors (connection failures, table not found, schema mismatches, etc.). Inherits from `Exception`. 
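A minimal handling sketch, mirroring the usage guide (`admin`, `table_path`, and `table_descriptor` are assumed to exist already):

```python
try:
    await admin.create_table(table_path, table_descriptor)
except fluss.FlussError as e:
    # .message carries the error description (connection failure, missing table, ...)
    print(f"Fluss error: {e.message}")
```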
- -## Constants - -| Constant | Value | Description | -|---|---|---| -| `fluss.EARLIEST_OFFSET` | `-2` | Start reading from earliest available offset | -| `fluss.LATEST_OFFSET` | `-1` | Start reading from latest offset (only new records) | -| `fluss.OffsetType.EARLIEST` | `"earliest"` | For `list_offsets()` | -| `fluss.OffsetType.LATEST` | `"latest"` | For `list_offsets()` | -| `fluss.OffsetType.TIMESTAMP` | `"timestamp"` | For `list_offsets()` with timestamp | - -## `ChangeType` - -| Value | Short String | Description | -|---|---|---| -| `ChangeType.AppendOnly` (0) | `+A` | Append-only | -| `ChangeType.Insert` (1) | `+I` | Insert | -| `ChangeType.UpdateBefore` (2) | `-U` | Previous value of updated row | -| `ChangeType.UpdateAfter` (3) | `+U` | New value of updated row | -| `ChangeType.Delete` (4) | `-D` | Delete | - -## Data Types - -| PyArrow Type | Fluss Type | Python Type | -|---|---|---| -| `pa.boolean()` | Boolean | `bool` | -| `pa.int8()` / `int16()` / `int32()` / `int64()` | TinyInt / SmallInt / Int / BigInt | `int` | -| `pa.float32()` / `float64()` | Float / Double | `float` | -| `pa.string()` | String | `str` | -| `pa.binary()` | Bytes | `bytes` | -| `pa.date32()` | Date | `datetime.date` | -| `pa.time32("ms")` | Time | `datetime.time` | -| `pa.timestamp("us")` | Timestamp (NTZ) | `datetime.datetime` | -| `pa.timestamp("us", tz="UTC")` | TimestampLTZ | `datetime.datetime` | -| `pa.decimal128(precision, scale)` | Decimal | `decimal.Decimal` | diff --git a/bindings/python/DEVELOPMENT.md b/bindings/python/DEVELOPMENT.md index e316f5e8..cccd0d1e 100644 --- a/bindings/python/DEVELOPMENT.md +++ b/bindings/python/DEVELOPMENT.md @@ -1,22 +1,3 @@ - - # Development ## Requirements diff --git a/bindings/python/README.md b/bindings/python/README.md index 20c5f552..54a167bc 100644 --- a/bindings/python/README.md +++ b/bindings/python/README.md @@ -18,436 +18,4 @@ # Fluss Python Client -This guide covers how to use the Fluss Python client for reading and writing data to log tables and primary key tables. - -The Python client is async-first, built on top of the Rust core via [PyO3](https://pyo3.rs/), and uses [PyArrow](https://arrow.apache.org/docs/python/) for schema definitions and data interchange. - -## Key Concepts - -- **Log table** — an append-only table (no primary key). Records are immutable once written. Use for event streams, logs, and audit trails. -- **Primary key (PK) table** — a table with a primary key. Supports upsert, delete, and point lookups. -- **Bucket** — the unit of parallelism within a table (similar to Kafka partitions). Each table has one or more buckets. Readers subscribe to individual buckets. -- **Partition** — a way to organize data by column values (e.g. by date or region). Each partition contains its own set of buckets. Partitions must be created explicitly before writing. -- **Offset** — the position of a record within a bucket. Used to track reading progress. Start from `EARLIEST_OFFSET` to read all data, or `LATEST_OFFSET` to only read new records. - -## Prerequisites - -You need a running Fluss cluster to use the Python client. See the [Quick-Start guide](../../README.md#quick-start) for how to start a local cluster. - -## Installation - -```bash -pip install pyfluss -``` - -To build from source instead, see the [Development Guide](DEVELOPMENT.md). - -## Quick Start - -A minimal end-to-end example: connect, create a table, write data, and read it back. Assumes a Fluss cluster is running on `localhost:9123`. 
- -```python -import asyncio -import pyarrow as pa -import fluss - -async def main(): - # Connect - config = fluss.Config({"bootstrap.servers": "127.0.0.1:9123"}) - conn = await fluss.FlussConnection.create(config) - admin = await conn.get_admin() - - # Create a log table - schema = fluss.Schema(pa.schema([ - pa.field("id", pa.int32()), - pa.field("name", pa.string()), - pa.field("score", pa.float32()), - ])) - table_path = fluss.TablePath("fluss", "quick_start") - await admin.create_table(table_path, fluss.TableDescriptor(schema), ignore_if_exists=True) - - # Write - table = await conn.get_table(table_path) - writer = table.new_append().create_writer() - writer.append({"id": 1, "name": "Alice", "score": 95.5}) - writer.append({"id": 2, "name": "Bob", "score": 87.0}) - await writer.flush() - - # Read - num_buckets = (await admin.get_table_info(table_path)).num_buckets - scanner = await table.new_scan().create_record_batch_log_scanner() - scanner.subscribe_buckets({i: fluss.EARLIEST_OFFSET for i in range(num_buckets)}) - print(scanner.to_pandas()) - - # Cleanup - await admin.drop_table(table_path, ignore_if_not_exists=True) - conn.close() - -asyncio.run(main()) -``` - -## Connection Setup - -```python -config = fluss.Config({"bootstrap.servers": "127.0.0.1:9123"}) -conn = await fluss.FlussConnection.create(config) -``` - -The connection also supports context managers: - -```python -with await fluss.FlussConnection.create(config) as conn: - ... -``` - -### Configuration Options - -| Key | Description | Default | -|-----|-------------|---------| -| `bootstrap.servers` | Coordinator server address | `127.0.0.1:9123` | -| `request.max.size` | Maximum request size in bytes | `10485760` (10 MB) | -| `writer.acks` | Acknowledgment setting (`all` waits for all replicas) | `all` | -| `writer.retries` | Number of retries on failure | `2147483647` | -| `writer.batch.size` | Batch size for writes in bytes | `2097152` (2 MB) | - -## Admin Operations - -```python -admin = await conn.get_admin() -``` - -### Databases - -```python -await admin.create_database("my_database", ignore_if_exists=True) -databases = await admin.list_databases() -exists = await admin.database_exists("my_database") -await admin.drop_database("my_database", ignore_if_not_exists=True, cascade=True) -``` - -### Tables - -Schemas are defined using PyArrow and wrapped in `fluss.Schema`: - -```python -import pyarrow as pa - -schema = fluss.Schema(pa.schema([ - pa.field("id", pa.int32()), - pa.field("name", pa.string()), - pa.field("amount", pa.int64()), -])) - -table_path = fluss.TablePath("my_database", "my_table") -await admin.create_table(table_path, fluss.TableDescriptor(schema), ignore_if_exists=True) - -table_info = await admin.get_table_info(table_path) -tables = await admin.list_tables("my_database") -await admin.drop_table(table_path, ignore_if_not_exists=True) -``` - -`TableDescriptor` accepts these optional parameters: - -| Parameter | Description | -|---|---| -| `partition_keys` | Column names to partition by (e.g. `["region"]`) | -| `bucket_count` | Number of buckets (parallelism units) for the table | -| `bucket_keys` | Columns used to determine bucket assignment | -| `comment` | Table comment / description | -| `log_format` | Log storage format: `"ARROW"` or `"INDEXED"` | -| `kv_format` | KV storage format for primary key tables: `"INDEXED"` or `"COMPACTED"` | -| `properties` | Table configuration properties as a dict (e.g. 
`{"table.replication.factor": "1"}`) | -| `custom_properties` | User-defined properties as a dict | - -### Offsets - -```python -# Latest offsets for buckets -offsets = await admin.list_offsets(table_path, bucket_ids=[0, 1], offset_type="latest") - -# By timestamp -offsets = await admin.list_offsets(table_path, bucket_ids=[0], offset_type="timestamp", timestamp=1704067200000) - -# Per-partition offsets -offsets = await admin.list_partition_offsets(table_path, partition_name="US", bucket_ids=[0], offset_type="latest") -``` - -## Log Tables - -Log tables are append-only tables without primary keys, suitable for event streaming. - -### Writing - -Rows can be appended as dicts, lists, or tuples. For bulk writes, use `write_arrow()`, `write_arrow_batch()`, or `write_pandas()`. - -Write methods like `append()` and `write_arrow_batch()` return a `WriteResultHandle`. You can ignore it for fire-and-forget semantics (flush at the end), or `await handle.wait()` to block until the server acknowledges that specific write. - -```python -table = await conn.get_table(table_path) -writer = table.new_append().create_writer() - -# Fire-and-forget: queue writes, flush at the end -writer.append({"id": 1, "name": "Alice", "score": 95.5}) -writer.append([2, "Bob", 87.0]) -await writer.flush() - -# Per-record acknowledgment -handle = writer.append({"id": 3, "name": "Charlie", "score": 91.0}) -await handle.wait() - -# Bulk writes -writer.write_arrow(pa_table) # PyArrow Table -writer.write_arrow_batch(record_batch) # PyArrow RecordBatch -writer.write_pandas(df) # Pandas DataFrame -await writer.flush() -``` - -### Reading - -There are two scanner types: -- **Batch scanner** (`create_record_batch_log_scanner()`) — returns Arrow Tables or DataFrames, best for analytics -- **Record scanner** (`create_log_scanner()`) — returns individual records with metadata (offset, timestamp, change type), best for streaming - -And two reading modes: -- **`to_arrow()` / `to_pandas()`** — reads all data from subscribed buckets up to the current latest offset, then returns. Best for one-shot batch reads. -- **`poll_arrow()` / `poll()` / `poll_record_batch()`** — returns whatever data is available within the timeout, then returns. Call in a loop for continuous streaming. 
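Of these, `poll_record_batch()` is the only call without an example in the sections below; a short sketch of using it to track per-batch metadata (assuming the same batch scanner setup shown there):

```python
scanner = await table.new_scan().create_record_batch_log_scanner()
scanner.subscribe(bucket_id=0, start_offset=fluss.EARLIEST_OFFSET)

while True:
    for rb in scanner.poll_record_batch(timeout_ms=5000):
        # Each RecordBatch pairs Arrow data with its bucket and offset range.
        print(f"bucket={rb.bucket.bucket_id}, "
              f"offsets={rb.base_offset}..{rb.last_offset}, rows={rb.batch.num_rows}")
```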
- -#### Batch Read (One-Shot) - -```python -num_buckets = (await admin.get_table_info(table_path)).num_buckets - -scanner = await table.new_scan().create_record_batch_log_scanner() -scanner.subscribe_buckets({i: fluss.EARLIEST_OFFSET for i in range(num_buckets)}) - -# Reads everything up to current latest offset, then returns -arrow_table = scanner.to_arrow() -df = scanner.to_pandas() -``` - -#### Continuous Polling - -Use `poll_arrow()` or `poll()` in a loop for streaming consumption: - -```python -# Batch scanner: poll as Arrow Tables -scanner = await table.new_scan().create_record_batch_log_scanner() -scanner.subscribe(bucket_id=0, start_offset=fluss.EARLIEST_OFFSET) - -while True: - result = scanner.poll_arrow(timeout_ms=5000) - if result.num_rows > 0: - print(result.to_pandas()) - -# Record scanner: poll individual records with metadata -scanner = await table.new_scan().create_log_scanner() -scanner.subscribe_buckets({i: fluss.EARLIEST_OFFSET for i in range(num_buckets)}) - -while True: - for record in scanner.poll(timeout_ms=5000): - print(f"offset={record.offset}, change={record.change_type.short_string()}, row={record.row}") -``` - -#### Subscribe from Latest Offset - -To only consume new records (skip existing data), use `LATEST_OFFSET`: - -```python -scanner = await table.new_scan().create_record_batch_log_scanner() -scanner.subscribe(bucket_id=0, start_offset=fluss.LATEST_OFFSET) -``` - -### Column Projection - -```python -scanner = await table.new_scan().project([0, 2]).create_record_batch_log_scanner() -# or by name -scanner = await table.new_scan().project_by_name(["id", "score"]).create_record_batch_log_scanner() -``` - -## Primary Key Tables - -Primary key tables support upsert, delete, and point lookup operations. - -### Creating - -Pass `primary_keys` to `fluss.Schema`: - -```python -schema = fluss.Schema( - pa.schema([ - pa.field("id", pa.int32()), - pa.field("name", pa.string()), - pa.field("age", pa.int64()), - ]), - primary_keys=["id"], -) -table_path = fluss.TablePath("fluss", "users") -await admin.create_table(table_path, fluss.TableDescriptor(schema, bucket_count=3), ignore_if_exists=True) -``` - -### Upsert, Delete, Lookup - -```python -table = await conn.get_table(table_path) - -# Upsert (fire-and-forget, flush at the end) -writer = table.new_upsert() -writer.upsert({"id": 1, "name": "Alice", "age": 25}) -writer.upsert({"id": 2, "name": "Bob", "age": 30}) -await writer.flush() - -# Per-record acknowledgment (for read-after-write) -handle = writer.upsert({"id": 3, "name": "Charlie", "age": 35}) -await handle.wait() - -# Delete by primary key -handle = writer.delete({"id": 2}) -await handle.wait() - -# Lookup -lookuper = table.new_lookup() -result = await lookuper.lookup({"id": 1}) -if result: - print(f"Found: name={result['name']}, age={result['age']}") -``` - -### Partial Updates - -Update specific columns while preserving others: - -```python -partial_writer = table.new_upsert(columns=["id", "age"]) -partial_writer.upsert({"id": 1, "age": 27}) # only updates age -await partial_writer.flush() -``` - -## Partitioned Tables - -Partitioned tables distribute data across partitions based on column values. Partitions must be created before writing. 
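Writing to a partition that does not exist yet fails with `fluss.FlussError`, so one defensive pattern is to derive the needed partitions from the rows and create them with `ignore_if_exists=True` before writing; a sketch, assuming the `partitioned_events` table defined below:

```python
rows = [
    {"id": 1, "region": "US", "value": 100},
    {"id": 2, "region": "EU", "value": 200},
]

# Create every partition referenced by the batch before appending.
for region in {r["region"] for r in rows}:
    await admin.create_partition(table_path, {"region": region}, ignore_if_exists=True)

writer = table.new_append().create_writer()
for r in rows:
    writer.append(r)
await writer.flush()
```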
- -### Creating and Managing Partitions - -```python -schema = fluss.Schema(pa.schema([ - pa.field("id", pa.int32()), - pa.field("region", pa.string()), - pa.field("value", pa.int64()), -])) - -table_path = fluss.TablePath("fluss", "partitioned_events") -await admin.create_table( - table_path, - fluss.TableDescriptor(schema, partition_keys=["region"], bucket_count=1), - ignore_if_exists=True, -) - -# Create partitions -await admin.create_partition(table_path, {"region": "US"}, ignore_if_exists=True) -await admin.create_partition(table_path, {"region": "EU"}, ignore_if_exists=True) - -# List partitions -partition_infos = await admin.list_partition_infos(table_path) -``` - -### Writing - -Same as non-partitioned tables — include partition column values in each row: - -```python -table = await conn.get_table(table_path) -writer = table.new_append().create_writer() -writer.append({"id": 1, "region": "US", "value": 100}) -writer.append({"id": 2, "region": "EU", "value": 200}) -await writer.flush() -``` - -### Reading - -Use `subscribe_partition()` or `subscribe_partition_buckets()` instead of `subscribe()`: - -```python -scanner = await table.new_scan().create_record_batch_log_scanner() - -# Subscribe to individual partitions -for p in partition_infos: - scanner.subscribe_partition(partition_id=p.partition_id, bucket_id=0, start_offset=fluss.EARLIEST_OFFSET) - -# Or batch-subscribe -scanner.subscribe_partition_buckets({ - (p.partition_id, 0): fluss.EARLIEST_OFFSET for p in partition_infos -}) - -print(scanner.to_pandas()) -``` - -### Partitioned Primary Key Tables - -Partition columns must be part of the primary key. Partitions must be created before upserting. - -```python -schema = fluss.Schema( - pa.schema([ - pa.field("user_id", pa.int32()), - pa.field("region", pa.string()), - pa.field("score", pa.int64()), - ]), - primary_keys=["user_id", "region"], -) - -table_path = fluss.TablePath("fluss", "partitioned_users") -await admin.create_table( - table_path, - fluss.TableDescriptor(schema, partition_keys=["region"]), - ignore_if_exists=True, -) - -await admin.create_partition(table_path, {"region": "US"}, ignore_if_exists=True) - -table = await conn.get_table(table_path) -writer = table.new_upsert() -writer.upsert({"user_id": 1, "region": "US", "score": 1234}) -await writer.flush() - -# Lookup includes partition columns -lookuper = table.new_lookup() -result = await lookuper.lookup({"user_id": 1, "region": "US"}) -``` - -## Error Handling - -The client raises `fluss.FlussError` for Fluss-specific errors (connection failures, table not found, invalid operations, etc.): - -```python -try: - await admin.create_table(table_path, table_descriptor) -except fluss.FlussError as e: - print(f"Fluss error: {e.message}") -``` - -Common error scenarios: -- **Connection refused** — Fluss cluster is not running or wrong address in `bootstrap.servers` -- **Table not found** — table doesn't exist or wrong database/table name -- **Partition not found** — writing to a partitioned table before creating partitions -- **Schema mismatch** — row data doesn't match the table schema - -## Data Types - -The Python client uses PyArrow types for schema definitions: - -| PyArrow Type | Fluss Type | Python Type | -|---|---|---| -| `pa.boolean()` | Boolean | `bool` | -| `pa.int8()` / `int16()` / `int32()` / `int64()` | TinyInt / SmallInt / Int / BigInt | `int` | -| `pa.float32()` / `float64()` | Float / Double | `float` | -| `pa.string()` | String | `str` | -| `pa.binary()` | Bytes | `bytes` | -| `pa.date32()` | Date | 
`datetime.date` | -| `pa.time32("ms")` | Time | `datetime.time` | -| `pa.timestamp("us")` | Timestamp (NTZ) | `datetime.datetime` | -| `pa.timestamp("us", tz="UTC")` | TimestampLTZ | `datetime.datetime` | -| `pa.decimal128(precision, scale)` | Decimal | `decimal.Decimal` | - -All Python native types (`date`, `time`, `datetime`, `Decimal`) work when appending rows via dicts. - -For a complete list of classes, methods, and properties, see the [API Reference](API_REFERENCE.md). +For full documentation, see the [Python user guide](../../website/docs/user-guide/python/). diff --git a/bindings/python/generate_readme.py b/bindings/python/generate_readme.py new file mode 100644 index 00000000..206f9e2a --- /dev/null +++ b/bindings/python/generate_readme.py @@ -0,0 +1,104 @@ +#!/usr/bin/env python3 +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to you under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Generate bindings/python/GENERATED_README.md from the website docs. + +Usage: + python generate_readme.py # writes GENERATED_README.md + python generate_readme.py --check # exits non-zero if GENERATED_README.md is stale +""" + +from __future__ import annotations + +import re +import sys +from pathlib import Path + +SCRIPT_DIR = Path(__file__).resolve().parent +DOCS_DIR = SCRIPT_DIR / "../../website/docs/user-guide/python" + +LICENSE_HEADER = """\ + +""" + +# Files in the order they should appear in the README. +SECTIONS: list[str] = [ + "installation.md", + "example/index.md", + "example/configuration.md", + "example/admin-operations.md", + "example/log-tables.md", + "example/primary-key-tables.md", + "example/partitioned-tables.md", + "error-handling.md", + "data-types.md", + "api-reference.md", +] + +FRONTMATTER_RE = re.compile(r"^---\n.*?^---\n", re.MULTILINE | re.DOTALL) + + +def strip_frontmatter(text: str) -> str: + return FRONTMATTER_RE.sub("", text, count=1) + + +def build_readme() -> str: + parts = [LICENSE_HEADER, "# Fluss Python Client\n"] + + for section in SECTIONS: + path = DOCS_DIR / section + if not path.exists(): + print(f"warning: {path} not found, skipping", file=sys.stderr) + continue + content = strip_frontmatter(path.read_text()).strip() + parts.append(content) + + return "\n\n".join(parts) + "\n" + + +def main() -> None: + readme = build_readme() + dest = SCRIPT_DIR / "GENERATED_README.md" + + if "--check" in sys.argv: + if not dest.exists() or dest.read_text() != readme: + print("GENERATED_README.md is out of date. 
Run: python generate_readme.py") + sys.exit(1) + print("GENERATED_README.md is up to date.") + return + + dest.write_text(readme) + print(f"Wrote {dest}") + + +if __name__ == "__main__": + main() diff --git a/bindings/python/pyproject.toml b/bindings/python/pyproject.toml index 0e61b234..0be25a03 100644 --- a/bindings/python/pyproject.toml +++ b/bindings/python/pyproject.toml @@ -24,7 +24,7 @@ name = "pyfluss" description = "Apache Fluss (incubating) Python Binding" authors = [{name = "Apache Fluss", email = "dev@fluss.apache.org"}] license = {text = "Apache-2.0"} -readme = "README.md" +readme = "GENERATED_README.md" requires-python = ">=3.9" classifiers = [ "License :: OSI Approved :: Apache Software License", diff --git a/crates/fluss/README.md b/crates/fluss/README.md index bee8ce84..aad8de96 100644 --- a/crates/fluss/README.md +++ b/crates/fluss/README.md @@ -1,21 +1,3 @@ - - # Apache Fluss™ Rust Client (Incubating) Rust client library for [Apache Fluss™](https://fluss.apache.org/). This crate provides the core client used by the fluss-rust workspace and by the Python and C++ bindings. diff --git a/docs/creating-a-release.md b/docs/creating-a-release.md index 164b64b7..8b4c868f 100644 --- a/docs/creating-a-release.md +++ b/docs/creating-a-release.md @@ -1,21 +1,3 @@ - - # Creating a Fluss Rust Client Release This document describes in detail how to create a release of the **Fluss clients** (fluss-rust, fluss-python, fluss-cpp) from the [fluss-rust](https://github.com/apache/fluss-rust) repository. It is based on the [Creating a Fluss Release](https://fluss.apache.org/community/how-to-release/creating-a-fluss-release/) guide of the Apache Fluss project and the [release guide of Apache OpenDAL](https://nightlies.apache.org/opendal/opendal-docs-stable/community/release/); releases are source archives plus CI-published crates.io and PyPI. diff --git a/docs/generate-release-note.md b/docs/generate-release-note.md index 1167f4cd..edbc43bd 100644 --- a/docs/generate-release-note.md +++ b/docs/generate-release-note.md @@ -1,21 +1,3 @@ - - # Generate Release Note Use GitHub's **Generate release notes** to produce a draft from merged PRs between tags. Categories (Added, Fixed, Docs, etc.) are configured in [.github/release.yml](../.github/release.yml). diff --git a/docs/rust-client.md b/docs/rust-client.md index 03968829..e69de29b 100644 --- a/docs/rust-client.md +++ b/docs/rust-client.md @@ -1,755 +0,0 @@ - - -# Fluss Rust Client Guide - -This guide covers how to use the Fluss Rust client for reading and writing data to log tables and primary key tables. - -## Adding to Your Project - -The Fluss Rust client is published to [crates.io](https://crates.io/crates/fluss-rs) as `fluss-rs`. The crate's library name is `fluss`, so you import it with `use fluss::...`. 
- -```toml -[dependencies] -fluss-rs = "0.1" -tokio = { version = "1", features = ["full"] } -``` - -### Feature Flags - -The Fluss crate supports optional storage backends: - -```toml -[dependencies] -# Default: memory and filesystem storage -fluss-rs = "0.1" - -# With S3 storage support -fluss-rs = { version = "0.1", features = ["storage-s3"] } - -# With OSS storage support -fluss-rs = { version = "0.1", features = ["storage-oss"] } - -# All storage backends -fluss-rs = { version = "0.1", features = ["storage-all"] } -``` - -Available features: -- `storage-memory` (default) - In-memory storage -- `storage-fs` (default) - Local filesystem storage -- `storage-s3` - Amazon S3 storage -- `storage-oss` - Alibaba OSS storage -- `storage-all` - All storage backends - -### Alternative: Git or Path Dependency - -For development against unreleased changes, you can depend on the Git repository or a local checkout: - -```toml -[dependencies] -# From Git -fluss = { git = "https://github.com/apache/fluss-rust.git", package = "fluss-rs" } - -# From local path -fluss = { path = "/path/to/fluss-rust/crates/fluss", package = "fluss-rs" } -``` - -> **Note:** When using `git` or `path` dependencies, the `package = "fluss-rs"` field is required so that Cargo resolves the correct package while still allowing `use fluss::...` imports. - -## Building from Source - -### Prerequisites - -- Rust 1.85+ -- Protobuf compiler (`protoc`) - only required when [building from source](#building-from-source) - - -### 1. Clone the Repository - -```bash -git clone https://github.com/apache/fluss-rust.git -cd fluss-rust -``` - -### 2. Install Dependencies - -The Protobuf compiler (`protoc`) is required to build from source. - -#### macOS - -```bash -brew install protobuf -``` - -#### Ubuntu/Debian - -```bash -sudo apt-get install protobuf-compiler -``` - -### 3. Build the Library - -```bash -cargo build --workspace --all-targets -``` - -## Connection Setup - -```rust -use fluss::client::FlussConnection; -use fluss::config::Config; -use fluss::error::Result; - -#[tokio::main] -async fn main() -> Result<()> { - let mut config = Config::default(); - config.bootstrap_servers = "127.0.0.1:9123".to_string(); - - let conn = FlussConnection::new(config).await?; - - // Use the connection... 
- - Ok(()) -} -``` - -### Configuration Options - -| Option | Description | Default | -|--------|-------------|---------| -| `bootstrap_servers` | Coordinator server address | `127.0.0.1:9123` | -| `writer_request_max_size` | Maximum request size in bytes | 10 MB | -| `writer_acks` | Acknowledgment setting (`all` waits for all replicas) | `all` | -| `writer_retries` | Number of retries on failure | `i32::MAX` | -| `writer_batch_size` | Batch size for writes | 2 MB | - -## Admin Operations - -### Get Admin Interface - -```rust -let admin = conn.get_admin().await?; -``` - -### Database Operations - -```rust -// Create database -admin.create_database("my_database", None, true).await?; - -// List all databases -let databases = admin.list_databases().await?; -println!("Databases: {:?}", databases); - -// Check if database exists -let exists = admin.database_exists("my_database").await?; - -// Get database information -let db_info = admin.get_database_info("my_database").await?; - -// Drop database -admin.drop_database("my_database", true, false).await?; -``` - -### Table Operations - -```rust -use fluss::metadata::{DataTypes, Schema, TableDescriptor, TablePath}; - -// Define table schema -let table_descriptor = TableDescriptor::builder() - .schema( - Schema::builder() - .column("id", DataTypes::int()) - .column("name", DataTypes::string()) - .column("amount", DataTypes::bigint()) - .build()?, - ) - .build()?; - -let table_path = TablePath::new("my_database", "my_table"); - -// Create table -admin.create_table(&table_path, &table_descriptor, true).await?; - -// Get table information -let table_info = admin.get_table_info(&table_path).await?; -println!("Table: {}", table_info); - -// List tables in database -let tables = admin.list_tables("my_database").await?; - -// Check if table exists -let exists = admin.table_exists(&table_path).await?; - -// Drop table -admin.drop_table(&table_path, true).await?; -``` - -### Partition Operations - -```rust -use fluss::metadata::PartitionSpec; -use std::collections::HashMap; - -// List all partitions -let partitions = admin.list_partition_infos(&table_path).await?; - -// List partitions matching a spec -let mut filter = HashMap::new(); -filter.insert("year", "2024"); -let spec = PartitionSpec::new(filter); -let partitions = admin.list_partition_infos_with_spec(&table_path, Some(&spec)).await?; - -// Create partition -admin.create_partition(&table_path, &spec, true).await?; - -// Drop partition -admin.drop_partition(&table_path, &spec, true).await?; -``` - -### Offset Operations - -```rust -use fluss::rpc::message::OffsetSpec; - -let bucket_ids = vec![0, 1, 2]; - -// Get earliest offsets -let earliest = admin.list_offsets(&table_path, &bucket_ids, OffsetSpec::Earliest).await?; - -// Get latest offsets -let latest = admin.list_offsets(&table_path, &bucket_ids, OffsetSpec::Latest).await?; - -// Get offsets for a specific timestamp -let timestamp_ms = 1704067200000; // 2024-01-01 00:00:00 UTC -let offsets = admin.list_offsets(&table_path, &bucket_ids, OffsetSpec::Timestamp(timestamp_ms)).await?; - -// Get offsets for a specific partition -let partition_offsets = admin.list_partition_offsets( - &table_path, - "partition_name", - &bucket_ids, - OffsetSpec::Latest, -).await?; -``` - -### Lake Snapshot - -```rust -// Get latest lake snapshot for lakehouse integration -let snapshot = admin.get_latest_lake_snapshot(&table_path).await?; -println!("Snapshot ID: {}", snapshot.snapshot_id); -``` - -## Log Table Operations - -Log tables are append-only tables without 
primary keys, suitable for event streaming. - -### Creating a Log Table - -```rust -let table_descriptor = TableDescriptor::builder() - .schema( - Schema::builder() - .column("event_id", DataTypes::int()) - .column("event_type", DataTypes::string()) - .column("timestamp", DataTypes::bigint()) - .build()?, - ) - .build()?; - -let table_path = TablePath::new("fluss", "events"); -admin.create_table(&table_path, &table_descriptor, true).await?; -``` - -### Writing to Log Tables - -```rust -use fluss::row::{GenericRow, InternalRow}; - -let table = conn.get_table(&table_path).await?; -let append_writer = table.new_append()?.create_writer()?; - -// Write a single row -let mut row = GenericRow::new(3); -row.set_field(0, 1); // event_id (int) -row.set_field(1, "user_login"); // event_type (string) -row.set_field(2, 1704067200000i64); // timestamp (bigint) - -append_writer.append(&row)?; - -// Write multiple rows -let mut row2 = GenericRow::new(3); -row2.set_field(0, 2); -row2.set_field(1, "page_view"); -row2.set_field(2, 1704067201000i64); - -append_writer.append(&row2)?; - -// Flush to ensure data is persisted -append_writer.flush().await?; -``` - -Write operations (`append`, `upsert`, `delete`) use a **fire-and-forget** pattern for efficient batching. Each call queues the write and returns a `WriteResultFuture` immediately. Call `flush()` to ensure all queued writes are sent to the server. - -If you need per-record acknowledgment, you can await the returned future: - -```rust -// Per-record acknowledgment (blocks until server confirms) -append_writer.append(&row)?.await?; -``` - -### Reading from Log Tables - -```rust -use std::time::Duration; - -let table = conn.get_table(&table_path).await?; -let log_scanner = table.new_scan().create_log_scanner()?; - -// Subscribe to bucket 0 starting from offset 0 -log_scanner.subscribe(0, 0).await?; - -// Poll for records -let records = log_scanner.poll(Duration::from_secs(10)).await?; - -for record in records { - let row = record.row(); - println!( - "event_id={}, event_type={}, timestamp={} @ offset={}", - row.get_int(0), - row.get_string(1), - row.get_long(2), - record.offset() - ); -} -``` - -### Column Projection - -```rust -// Project specific columns by index -let scanner = table.new_scan().project(&[0, 2])?.create_log_scanner()?; - -// Or project by column names -let scanner = table.new_scan().project_by_name(&["event_id", "timestamp"])?.create_log_scanner()?; -``` - -### Subscribe from Specific Offsets - -```rust -use fluss::client::{EARLIEST_OFFSET, LATEST_OFFSET}; - -// Subscribe from earliest available offset -log_scanner.subscribe(0, EARLIEST_OFFSET).await?; - -// Subscribe from latest offset (only new records) -log_scanner.subscribe(0, LATEST_OFFSET).await?; - -// Subscribe from a specific offset -log_scanner.subscribe(0, 42).await?; - -// Subscribe to all buckets -let num_buckets = table.get_table_info().get_num_buckets(); -for bucket_id in 0..num_buckets { - log_scanner.subscribe(bucket_id, 0).await?; -} -``` - -### Subscribe to Multiple Buckets - -```rust -use std::collections::HashMap; - -// Subscribe to multiple buckets at once with specific offsets -let mut bucket_offsets = HashMap::new(); -bucket_offsets.insert(0, 0i64); // bucket 0 from offset 0 -bucket_offsets.insert(1, 100i64); // bucket 1 from offset 100 -log_scanner.subscribe_buckets(&bucket_offsets).await?; -``` - -### Unsubscribe from a Bucket - -```rust -// Unsubscribe from a specific bucket (non-partitioned tables) -log_scanner.unsubscribe(bucket_id).await?; -``` - -### 
Unsubscribe from a Partition - -```rust -// Unsubscribe from a specific partition bucket -log_scanner.unsubscribe_partition(partition_id, bucket_id).await?; -``` - -## Partitioned Log Tables - -Partitioned tables distribute data across partitions based on partition column values, enabling efficient data organization and querying. - -### Creating a Partitioned Log Table - -```rust -use fluss::metadata::{DataTypes, LogFormat, Schema, TableDescriptor, TablePath}; - -let table_descriptor = TableDescriptor::builder() - .schema( - Schema::builder() - .column("event_id", DataTypes::int()) - .column("event_type", DataTypes::string()) - .column("dt", DataTypes::string()) // partition column - .column("region", DataTypes::string()) // partition column - .build()?, - ) - .partitioned_by(vec!["dt", "region"]) // Define partition columns - .log_format(LogFormat::ARROW) - .build()?; - -let table_path = TablePath::new("fluss", "partitioned_events"); -admin.create_table(&table_path, &table_descriptor, true).await?; -``` - -### Writing to Partitioned Log Tables - -Writing works the same as non-partitioned tables. Include partition column values in each row: - -```rust -let table = conn.get_table(&table_path).await?; -let append_writer = table.new_append()?.create_writer()?; - -// Partition column values determine which partition the record goes to -let mut row = GenericRow::new(4); -row.set_field(0, 1); // event_id -row.set_field(1, "user_login"); // event_type -row.set_field(2, "2024-01-15"); // dt (partition column) -row.set_field(3, "US"); // region (partition column) - -append_writer.append(&row)?; -append_writer.flush().await?; -``` - -### Reading from Partitioned Log Tables - -For partitioned tables, use `subscribe_partition()` instead of `subscribe()`: - -```rust -use std::time::Duration; - -let table = conn.get_table(&table_path).await?; -let admin = conn.get_admin().await?; - -// Get partition information -let partitions = admin.list_partition_infos(&table_path).await?; - -let log_scanner = table.new_scan().create_log_scanner()?; - -// Subscribe to each partition's buckets -for partition_info in &partitions { - let partition_id = partition_info.get_partition_id(); - let num_buckets = table.get_table_info().get_num_buckets(); - - for bucket_id in 0..num_buckets { - log_scanner.subscribe_partition(partition_id, bucket_id, 0).await?; - } -} - -// Poll for records -let records = log_scanner.poll(Duration::from_secs(10)).await?; -for record in records { - println!("Record from partition: {:?}", record.row()); -} -``` - -You can also subscribe to multiple partition-buckets at once: - -```rust -use std::collections::HashMap; - -let mut partition_bucket_offsets = HashMap::new(); -partition_bucket_offsets.insert((partition_id, 0), 0i64); // partition, bucket 0, offset 0 -partition_bucket_offsets.insert((partition_id, 1), 0i64); // partition, bucket 1, offset 0 -log_scanner.subscribe_partition_buckets(&partition_bucket_offsets).await?; -``` - -### Managing Partitions - -```rust -use fluss::metadata::PartitionSpec; -use std::collections::HashMap; - -// Create a partition -let mut partition_values = HashMap::new(); -partition_values.insert("dt", "2024-01-15"); -partition_values.insert("region", "EMEA"); -let spec = PartitionSpec::new(partition_values); -admin.create_partition(&table_path, &spec, true).await?; - -// List all partitions -let partitions = admin.list_partition_infos(&table_path).await?; -for partition in &partitions { - println!( - "Partition: id={}, name={}", - partition.get_partition_id(), - 
partition.get_partition_name() // Format: "value1$value2" - ); -} - -// List partitions with filter (partial spec) -let mut partial_values = HashMap::new(); -partial_values.insert("dt", "2024-01-15"); -let partial_spec = PartitionSpec::new(partial_values); -let filtered = admin.list_partition_infos_with_spec(&table_path, Some(&partial_spec)).await?; - -// Drop a partition -admin.drop_partition(&table_path, &spec, true).await?; -``` - -## Primary Key Table Operations - -Primary key tables (KV tables) support upsert, delete, and lookup operations. - -### Creating a Primary Key Table - -```rust -let table_descriptor = TableDescriptor::builder() - .schema( - Schema::builder() - .column("id", DataTypes::int()) - .column("name", DataTypes::string()) - .column("age", DataTypes::bigint()) - .primary_key(vec!["id"]) // Define primary key - .build()?, - ) - .build()?; - -let table_path = TablePath::new("fluss", "users"); -admin.create_table(&table_path, &table_descriptor, true).await?; -``` - -### Upserting Records - -```rust -let table = conn.get_table(&table_path).await?; -let table_upsert = table.new_upsert()?; -let upsert_writer = table_upsert.create_writer()?; - -// Insert or update records -for (id, name, age) in [(1, "Alice", 25i64), (2, "Bob", 30), (3, "Charlie", 35)] { - let mut row = GenericRow::new(3); - row.set_field(0, id); - row.set_field(1, name); - row.set_field(2, age); - upsert_writer.upsert(&row)?; -} -upsert_writer.flush().await?; -``` - -### Updating Records - -```rust -// Update existing record (same primary key) -let mut row = GenericRow::new(3); -row.set_field(0, 1); // id (primary key) -row.set_field(1, "Alice"); // name -row.set_field(2, 26i64); // Updated age - -upsert_writer.upsert(&row)?; -upsert_writer.flush().await?; -``` - -### Deleting Records - -```rust -// Delete by primary key (only primary key field needs to be set) -let mut row = GenericRow::new(3); -row.set_field(0, 2); // id of record to delete - -upsert_writer.delete(&row)?; -upsert_writer.flush().await?; -``` - -### Partial Updates - -Update only specific columns while preserving others: - -```rust -// By column indices -let partial_upsert = table_upsert.partial_update(Some(vec![0, 2]))?; -let partial_writer = partial_upsert.create_writer()?; - -let mut row = GenericRow::new(3); -row.set_field(0, 1); // id (primary key, required) -row.set_field(2, 27i64); // age (will be updated) -// name will remain unchanged - -partial_writer.upsert(&row)?; -partial_writer.flush().await?; - -// By column names -let partial_upsert = table_upsert.partial_update_with_column_names(&["id", "age"])?; -let partial_writer = partial_upsert.create_writer()?; -``` - -### Looking Up Records - -```rust -let mut lookuper = table.new_lookup()?.create_lookuper()?; - -// Create a key row (only primary key fields) -let mut key = GenericRow::new(1); -key.set_field(0, 1); // id to lookup - -let result = lookuper.lookup(&key).await?; - -if let Some(row) = result.get_single_row()? { - println!( - "Found: id={}, name={}, age={}", - row.get_int(0), - row.get_string(1), - row.get_long(2) - ); -} else { - println!("Record not found"); -} -``` - -## Partitioned Primary Key Tables - -Partitioned KV tables combine partitioning with primary key operations. Partition columns must be part of the primary key. 
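As with appends, `upsert()` and `delete()` return a `WriteResultFuture` that can be awaited for per-record acknowledgment instead of waiting for `flush()`; a brief sketch, assuming the `partitioned_users` table and `upsert_writer` set up in the sections that follow:

```rust
// Queue one upsert and await its acknowledgment (useful before an immediate lookup),
// rather than batching until flush().
let mut row = GenericRow::new(4);
row.set_field(0, 1001);    // user_id
row.set_field(1, "APAC");  // region (partition column)
row.set_field(2, 1i64);    // zone (partition column)
row.set_field(3, 1234i64); // score
upsert_writer.upsert(&row)?.await?;
```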
- -### Creating a Partitioned Primary Key Table - -```rust -use fluss::metadata::{DataTypes, KvFormat, Schema, TableDescriptor, TablePath}; - -let table_descriptor = TableDescriptor::builder() - .schema( - Schema::builder() - .column("user_id", DataTypes::int()) - .column("region", DataTypes::string()) // partition column - .column("zone", DataTypes::bigint()) // partition column - .column("score", DataTypes::bigint()) - // Primary key must include partition columns - .primary_key(vec!["user_id", "region", "zone"]) - .build()?, - ) - .partitioned_by(vec!["region", "zone"]) // Define partition columns - .kv_format(KvFormat::COMPACTED) - .build()?; - -let table_path = TablePath::new("fluss", "partitioned_users"); -admin.create_table(&table_path, &table_descriptor, true).await?; -``` - -### Writing to Partitioned Primary Key Tables - -Upsert and delete operations work the same as non-partitioned KV tables. **Partitions must be created before upserting data.** - -```rust -use fluss::metadata::PartitionSpec; -use std::collections::HashMap; - -let table = conn.get_table(&table_path).await?; - -// Ensure partitions exist before upserting -for (region, zone) in [("APAC", "1"), ("EMEA", "2"), ("US", "3")] { - let mut partition_values = HashMap::new(); - partition_values.insert("region", region); - partition_values.insert("zone", zone); - let spec = PartitionSpec::new(partition_values); - admin.create_partition(&table_path, &spec, true).await?; -} - -let table_upsert = table.new_upsert()?; -let upsert_writer = table_upsert.create_writer()?; - -// Upsert records - partition is determined by partition column values -for (user_id, region, zone, score) in [ - (1001, "APAC", 1i64, 1234i64), - (1002, "EMEA", 2, 2234), - (1003, "US", 3, 3234), -] { - let mut row = GenericRow::new(4); - row.set_field(0, user_id); - row.set_field(1, region); - row.set_field(2, zone); - row.set_field(3, score); - upsert_writer.upsert(&row)?; -} -upsert_writer.flush().await?; - -// Update a record -let mut row = GenericRow::new(4); -row.set_field(0, 1001); -row.set_field(1, "APAC"); -row.set_field(2, 1i64); -row.set_field(3, 5000i64); // Updated score -upsert_writer.upsert(&row)?; -upsert_writer.flush().await?; - -// Delete a record (primary key includes partition columns) -let mut row = GenericRow::new(4); -row.set_field(0, 1002); -row.set_field(1, "EMEA"); -row.set_field(2, 2i64); -upsert_writer.delete(&row)?; -upsert_writer.flush().await?; -``` - -### Looking Up Records in Partitioned Tables - -Lookup requires all primary key columns including partition columns: - -```rust -let mut lookuper = table.new_lookup()?.create_lookuper()?; - -// Key must include all primary key columns (including partition columns) -let mut key = GenericRow::new(3); -key.set_field(0, 1001); // user_id -key.set_field(1, "APAC"); // region (partition column) -key.set_field(2, 1i64); // zone (partition column) - -let result = lookuper.lookup(&key).await?; -if let Some(row) = result.get_single_row()? { - println!("Found: score={}", row.get_long(3)); -} -``` - -> **Note:** Scanning partitioned primary key tables is not supported. Use lookup operations instead. 
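Because scans are unavailable here, reading several rows from a partitioned primary key table means one point lookup per key; a sketch reusing the `partitioned_users` table and the `lookuper` from the examples above:

```rust
// Each key must contain the full primary key, including partition columns.
for (user_id, region, zone) in [(1001, "APAC", 1i64), (1003, "US", 3i64)] {
    let mut key = GenericRow::new(3);
    key.set_field(0, user_id);
    key.set_field(1, region);
    key.set_field(2, zone);

    let result = lookuper.lookup(&key).await?;
    if let Some(row) = result.get_single_row()? {
        println!("user_id={user_id}: score={}", row.get_long(3));
    }
}
```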
- -## Data Types - -| Fluss Type | Rust Type | Method | -|-----------------|----------------|---------------------------------------------------------------------| -| `BOOLEAN` | `bool` | `get_boolean()`, `set_field(idx, bool)` | -| `TINYINT` | `i8` | `get_byte()`, `set_field(idx, i8)` | -| `SMALLINT` | `i16` | `get_short()`, `set_field(idx, i16)` | -| `INT` | `i32` | `get_int()`, `set_field(idx, i32)` | -| `BIGINT` | `i64` | `get_long()`, `set_field(idx, i64)` | -| `FLOAT` | `f32` | `get_float()`, `set_field(idx, f32)` | -| `DOUBLE` | `f64` | `get_double()`, `set_field(idx, f64)` | -| `CHAR` | `&str` | `get_char(idx, length)`, `set_field(idx, &str)` | -| `STRING` | `&str` | `get_string()`, `set_field(idx, &str)` | -| `DECIMAL` | `Decimal` | `get_decimal(idx, precision, scale)`, `set_field(idx, Decimal)` | -| `DATE` | `Date` | `get_date()`, `set_field(idx, Date)` | -| `TIME` | `Time` | `get_time()`, `set_field(idx, Time)` | -| `TIMESTAMP` | `TimestampNtz` | `get_timestamp_ntz(idx, precision)`, `set_field(idx, TimestampNtz)` | -| `TIMESTAMP_LTZ` | `TimestampLtz` | `get_timestamp_ltz(idx, precision)`, `set_field(idx, TimestampLtz)` | -| `BYTES` | `&[u8]` | `get_bytes()`, `set_field(idx, &[u8])` | -| `BINARY(n)` | `&[u8]` | `get_binary(idx, length)`, `set_field(idx, &[u8])` | - diff --git a/docs/verifying-a-release-candidate.md b/docs/verifying-a-release-candidate.md index e67d4efc..dc70f723 100644 --- a/docs/verifying-a-release-candidate.md +++ b/docs/verifying-a-release-candidate.md @@ -1,21 +1,3 @@ - - # How to Verify a Release Candidate This document describes how to verify a release candidate (RC) of the **Fluss clients** (fluss-rust, fluss-python, fluss-cpp) from the [fluss-rust](https://github.com/apache/fluss-rust) repository. It is intended for anyone participating in the release vote (binding or non-binding) and is based on [Verifying a Fluss Release](https://fluss.apache.org/community/how-to-release/verifying-a-fluss-release/) of the Apache Fluss project, adapted for the fluss-rust source distribution and tooling (Rust, Python, C++). diff --git a/justfile b/justfile index c4e1a763..c2a61168 100644 --- a/justfile +++ b/justfile @@ -26,3 +26,7 @@ release [version]: # Usage: just bump-version e.g. just bump-version 0.1.0 0.1.1 bump-version from to: ./scripts/bump-version.sh {{from}} {{to}} + +# Regenerate bindings/python/GENERATED_README.md from website docs. +generate-python-readme: + python bindings/python/generate_readme.py diff --git a/website/babel.config.js b/website/babel.config.js new file mode 100644 index 00000000..e00595da --- /dev/null +++ b/website/babel.config.js @@ -0,0 +1,3 @@ +module.exports = { + presets: [require.resolve('@docusaurus/core/lib/babel/preset')], +}; diff --git a/website/docs/developer-guide/_category_.json b/website/docs/developer-guide/_category_.json new file mode 100644 index 00000000..cc7b01ab --- /dev/null +++ b/website/docs/developer-guide/_category_.json @@ -0,0 +1,4 @@ +{ + "label": "Developer Guide", + "position": 3 +} diff --git a/website/docs/developer-guide/contributing.md b/website/docs/developer-guide/contributing.md new file mode 100644 index 00000000..eced106a --- /dev/null +++ b/website/docs/developer-guide/contributing.md @@ -0,0 +1,126 @@ +# Contributing + +Welcome to the development guide for `fluss-rust`! This project builds the Fluss Rust client and language-specific bindings (Python, C++). 
+ +## Prerequisites + +- Rust 1.85+ (see [rust-toolchain.toml](https://github.com/apache/fluss-rust/blob/main/rust-toolchain.toml)) +- Protobuf compiler (`protoc`) + +Install using your preferred package/version manager: + +```bash +# Using mise +mise install protobuf +mise install rust + +# Using Homebrew (macOS) +brew install protobuf + +# Using apt (Ubuntu/Debian) +sudo apt-get install protobuf-compiler +``` + +## IDE Setup + +We recommend [RustRover](https://www.jetbrains.com/rust/) IDE. + +### Importing the Project + +1. Clone the repository: + ```bash + git clone https://github.com/apache/fluss-rust.git + ``` +2. Open RustRover, go to the `Projects` tab, click `Open`, and navigate to the root directory. +3. Click `Open`. + +### Copyright Profile + +Fluss is an Apache project, every file needs an Apache licence header. To automate this in RustRover: + +1. Go to `Settings` > `Editor` > `Copyright` > `Copyright Profiles`. +2. Add a new profile named `Apache` with this text: + ``` + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. + ``` +3. Go to `Editor` > `Copyright` and set `Apache` as the default profile. +4. Go to `Editor` > `Copyright` > `Formatting` > `Rust`, choose `Use custom formatting`, then `Use line comment`. +5. Click `Apply`. + +## Project Structure + +``` +crates/fluss (Fluss Rust client crate) +crates/examples (Rust client examples) +bindings/cpp (C++ bindings) +bindings/python (Python bindings - PyO3) +``` + +## Building and Testing + +### Rust Client + +```bash +# Build everything +cargo build --workspace --all-targets + +# Run unit tests +cargo test --workspace + +# Run integration tests (requires a running Fluss cluster) +RUST_TEST_THREADS=1 cargo test --features integration_tests --workspace + +# Run a single test +cargo test test_name +``` + +### Python Bindings + +```bash +cd bindings/python +pip install maturin +pip install -e ".[dev]" +maturin develop +``` + +### C++ Bindings + +```bash +cd bindings/cpp +mkdir -p build && cd build +cmake .. +cmake --build . +``` + +## License Check (cargo-deny) + +We use [cargo-deny](https://embarkstudios.github.io/cargo-deny/) to ensure all dependency licenses are Apache-compatible: + +```bash +cargo install cargo-deny --locked +cargo deny check licenses +``` + +## Formatting and Clippy + +CI runs formatting and clippy checks. 
Run these before submitting a PR: + +```bash +cargo fmt --all +cargo clippy --all-targets --fix --allow-dirty --allow-staged +``` diff --git a/website/docs/developer-guide/release.md b/website/docs/developer-guide/release.md new file mode 100644 index 00000000..0b6f3506 --- /dev/null +++ b/website/docs/developer-guide/release.md @@ -0,0 +1,181 @@ +# Release + +This document describes how to create a release of the Fluss clients (fluss-rust, fluss-python, fluss-cpp) from the [fluss-rust](https://github.com/apache/fluss-rust) repository. It follows the [Apache Fluss release guide](https://fluss.apache.org/community/how-to-release/creating-a-fluss-release/) and the [Apache OpenDAL release guide](https://nightlies.apache.org/opendal/opendal-docs-stable/community/release/). + +Publishing software has legal consequences. This guide complements the [Product Release Policy](https://www.apache.org/legal/release-policy.html) and [Release Distribution Policy](https://infra.apache.org/release-distribution.html). + +## Overview + +1. [Decide to release](#decide-to-release) +2. [Prepare for the release](#prepare-for-the-release) +3. [Build a release candidate](#build-a-release-candidate) +4. [Vote on the release candidate](#vote-on-the-release-candidate) +5. [Fix any issues](#fix-any-issues) (if needed, go back to step 3) +6. [Finalize the release](#finalize-the-release) +7. [Promote the release](#promote-the-release) + +## Decide to Release + +Deciding to release and selecting a Release Manager is a consensus-based decision of the community. Anybody can propose a release on the dev mailing list. + +## Prepare for the Release + +### One-Time Setup + +See [Release Manager Preparation](https://fluss.apache.org/community/how-to-release/release-manager-preparation/) for GPG key setup. For fluss-rust you do **not** need Nexus/Maven. + +### Install Rust + +The release script uses `git archive` and `gpg`. Building or verifying the project requires Rust (match [rust-toolchain.toml](https://github.com/apache/fluss-rust/blob/main/rust-toolchain.toml)). The dependency list script requires Python 3.11+. + +```bash +rustc --version +cargo --version +``` + +To use `just release`, install [just](https://github.com/casey/just). Otherwise run `./scripts/release.sh $RELEASE_VERSION`. + +### Set Environment Variables + +```bash +export RELEASE_VERSION="0.1.0" +export RELEASE_TAG="v${RELEASE_VERSION}" +export SVN_RELEASE_DIR="fluss-rust-${RELEASE_VERSION}" +export LAST_VERSION="0.0.9" # omit for the first release +export NEXT_VERSION="0.2.0" +``` + +### Generate Dependencies List + +Required by [ASF release policy](https://www.apache.org/legal/release-policy.html). Do this on `main` before creating the release branch. 
+ +```bash +git checkout main && git pull +python3 scripts/dependencies.py generate +git add **/DEPENDENCIES*.tsv +git commit -m "chore: update dependency list for release ${RELEASE_VERSION}" +git push origin main +``` + +### Create a Release Branch + +```bash +git checkout main && git pull +git checkout -b release-${RELEASE_VERSION} +git push origin release-${RELEASE_VERSION} +``` + +### Bump Version on Main + +```bash +git checkout main && git pull +./scripts/bump-version.sh $RELEASE_VERSION $NEXT_VERSION +git add Cargo.toml +git commit -m "Bump version to ${NEXT_VERSION}" +git push origin main +``` + +## Build a Release Candidate + +### Set RC Variables + +```bash +export RC_NUM="1" +export RC_TAG="v${RELEASE_VERSION}-rc${RC_NUM}" +export SVN_RC_DIR="fluss-rust-${RELEASE_VERSION}-rc${RC_NUM}" +``` + +### Tag and Push + +```bash +git checkout release-${RELEASE_VERSION} && git pull +git tag -s $RC_TAG -m "${RC_TAG}" +git push origin $RC_TAG +``` + +Pushing the tag triggers CI (GitHub Actions: Release Rust, Release Python). + +### Create Source Artifacts + +```bash +just release $RELEASE_VERSION +# Or: ./scripts/release.sh $RELEASE_VERSION +``` + +This creates under `dist/`: +- `fluss-rust-${RELEASE_VERSION}-incubating.tgz` +- `fluss-rust-${RELEASE_VERSION}-incubating.tgz.sha512` +- `fluss-rust-${RELEASE_VERSION}-incubating.tgz.asc` + +Verify: `gpg --verify dist/fluss-rust-${RELEASE_VERSION}-incubating.tgz.asc dist/fluss-rust-${RELEASE_VERSION}-incubating.tgz` + +### Stage to SVN + +```bash +svn checkout https://dist.apache.org/repos/dist/dev/incubator/fluss fluss-dist-dev --depth=immediates +cd fluss-dist-dev +mkdir $SVN_RC_DIR +cp ../dist/fluss-rust-${RELEASE_VERSION}-incubating.* $SVN_RC_DIR/ +svn add $SVN_RC_DIR +svn commit -m "Add fluss-rust ${RELEASE_VERSION} RC${RC_NUM}" +``` + +## Vote on the Release Candidate + +Start a vote on the dev@ mailing list with subject: `[VOTE] Release Apache Fluss clients ${RELEASE_VERSION} (RC${RC_NUM})` + +The vote is open for at least 72 hours. It requires at least 3 PPMC affirmative votes. If the project is in incubation, a second vote on general@incubator.apache.org is required. + +## Fix Any Issues + +If the vote fails: + +1. Fix issues on `main` or the release branch via PRs. +2. Optionally remove the old RC from dist.apache.org dev. +3. Increment `RC_NUM`, recreate tag and artifacts, and repeat. + +## Finalize the Release + +### Push the Release Tag + +```bash +git checkout $RC_TAG +git tag -s $RELEASE_TAG -m "Release fluss-rust, fluss-python, fluss-cpp ${RELEASE_VERSION}" +git push origin $RELEASE_TAG +``` + +### Deploy Source Artifacts + +```bash +svn mv -m "Release fluss-rust ${RELEASE_VERSION}" \ + https://dist.apache.org/repos/dist/dev/incubator/fluss/$SVN_RC_DIR \ + https://dist.apache.org/repos/dist/release/incubator/fluss/$SVN_RELEASE_DIR +``` + +### Verify Published Packages + +- **Rust:** [crates.io/crates/fluss-rs](https://crates.io/crates/fluss-rs) +- **Python:** [PyPI pyfluss](https://pypi.org/project/pyfluss/) +- **C++:** Distributed via the source archive + +### Create GitHub Release + +1. Go to [Releases > New release](https://github.com/apache/fluss-rust/releases/new). +2. Choose tag `$RELEASE_TAG`, target `release-${RELEASE_VERSION}`. +3. Generate release notes, add notable/breaking changes and download links. +4. Publish. + +### Update CHANGELOG.md + +Add an entry for `$RELEASE_VERSION` on `main`. + +## Promote the Release + +- Merge website PRs (release blog, download page). 
+- Wait 24 hours, then announce on dev@ and announce@apache.org. + +## See Also + +- [Release Manager Preparation](https://fluss.apache.org/community/how-to-release/release-manager-preparation/) +- [How to Verify a Release Candidate](https://github.com/apache/fluss-rust/blob/main/docs/verifying-a-release-candidate.md) +- [ASF Release Policy](https://www.apache.org/legal/release-policy.html) diff --git a/website/docs/index.md b/website/docs/index.md new file mode 100644 index 00000000..7117bcfb --- /dev/null +++ b/website/docs/index.md @@ -0,0 +1,33 @@ +--- +slug: / +sidebar_position: 1 +title: Introduction +--- + +# Introduction + +[Apache Fluss](https://fluss.apache.org/) (incubating) is a streaming storage system built for real-time analytics, serving as the real-time data layer for Lakehouse architectures. + +This documentation covers the **Fluss client libraries** for Rust, Python, and C++, which are developed in the [fluss-rust](https://github.com/apache/fluss-rust) repository. These clients allow you to: + +- **Create and manage** databases, tables, and partitions +- **Write** data to log tables (append-only) and primary key tables (upsert/delete) +- **Read** data via log scanning and key lookups +- **Integrate** with the broader Fluss ecosystem including lakehouse snapshots + +## Client Overview + +| | Rust | Python | C++ | +|------------------------|------------------------------------------------------------|--------------------------|------------------------------------------------| +| **Package** | [fluss-rs](https://crates.io/crates/fluss-rs) on crates.io | Build from source (PyO3) | Build from source (CMake) | +| **Async runtime** | Tokio | asyncio | Synchronous (Tokio runtime managed internally) | +| **Data format** | Arrow RecordBatch / GenericRow | PyArrow / Pandas / dict | Arrow RecordBatch / GenericRow | +| **Log tables** | Read + Write | Read + Write | Read + Write | +| **Primary key tables** | Upsert + Delete + Lookup | Upsert + Delete + Lookup | Upsert + Delete + Lookup | +| **Partitioned tables** | Full support | Write support | Full support | + +## How This Guide Is Organised + +The **User Guide** walks through installation, configuration, and working with each table type across all three languages. Code examples are shown side by side under **Rust**, **Python**, and **C++** headings. + +The **Developer Guide** covers building from source, running tests, and the release process for contributors. diff --git a/website/docs/user-guide/_category_.json b/website/docs/user-guide/_category_.json new file mode 100644 index 00000000..68ea78e7 --- /dev/null +++ b/website/docs/user-guide/_category_.json @@ -0,0 +1,4 @@ +{ + "label": "User Guide", + "position": 2 +} diff --git a/website/docs/user-guide/cpp/_category_.json b/website/docs/user-guide/cpp/_category_.json new file mode 100644 index 00000000..fbdf7a26 --- /dev/null +++ b/website/docs/user-guide/cpp/_category_.json @@ -0,0 +1,4 @@ +{ + "label": "C++", + "position": 3 +} diff --git a/website/docs/user-guide/cpp/api-reference.md b/website/docs/user-guide/cpp/api-reference.md new file mode 100644 index 00000000..07a8b071 --- /dev/null +++ b/website/docs/user-guide/cpp/api-reference.md @@ -0,0 +1,494 @@ +--- +sidebar_position: 2 +--- +# API Reference + +Complete API reference for the Fluss C++ client. 
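+
+Unless noted otherwise, operations return a `fluss::Result` and write their output into a caller-provided out-parameter rather than throwing exceptions. Below is a minimal sketch of this calling convention, using only calls documented on this page and assuming a cluster reachable at the example address used elsewhere in this guide.
+
+```cpp
+#include <iostream>
+#include "fluss.hpp"
+
+int main() {
+    fluss::Configuration config;
+    config.bootstrap_servers = "127.0.0.1:9123";
+
+    // Outputs are returned via out-parameters; the Result carries success or failure.
+    fluss::Connection conn;
+    fluss::Result result = fluss::Connection::Create(config, conn);
+    if (!result.Ok()) {
+        std::cerr << "connect failed (" << result.error_code << "): "
+                  << result.error_message << std::endl;
+        return 1;
+    }
+
+    fluss::Admin admin;
+    result = conn.GetAdmin(admin);
+    if (!result.Ok()) {
+        std::cerr << "GetAdmin failed: " << result.error_message << std::endl;
+        return 1;
+    }
+    return 0;
+}
+```
+
+The same pattern applies to every class below: construct the output object, pass it by reference, and check the returned `Result`.
+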
+ +## `Result` + +| Field / Method | Type | Description | +|-----------------|---------------|----------------------------------------------------------------| +| `error_code` | `int32_t` | 0 for success, non-zero for errors | +| `error_message` | `std::string` | Human-readable error description | +| `Ok()` | `bool` | Returns `true` if operation succeeded (`error_code == 0`) | + +## `Configuration` + +| Field | Type | Default | Description | +|-----------------------------------|---------------|----------------------|-----------------------------------------------------------------| +| `bootstrap_servers` | `std::string` | `"127.0.0.1:9123"` | Coordinator server address | +| `writer_request_max_size` | `int32_t` | `10485760` (10 MB) | Maximum request size in bytes | +| `writer_acks` | `std::string` | `"all"` | Acknowledgment setting (`"all"`, `"0"`, `"1"`, or `"-1"`) | +| `writer_retries` | `int32_t` | `INT32_MAX` | Number of retries on failure | +| `writer_batch_size` | `int32_t` | `2097152` (2 MB) | Batch size for writes in bytes | +| `scanner_remote_log_prefetch_num` | `size_t` | `4` | Number of remote log segments to prefetch | +| `remote_file_download_thread_num` | `size_t` | `3` | Number of threads for remote log downloads | + +## `Connection` + +| Method | Description | +|-------------------------------------------------------------------------|---------------------------------------------------| +| `static Create(const Configuration& config, Connection& out) -> Result` | Create a connection to a Fluss cluster | +| `GetAdmin(Admin& out) -> Result` | Get the admin interface | +| `GetTable(const TablePath& table_path, Table& out) -> Result` | Get a table for read/write operations | +| `Available() -> bool` | Check if the connection is valid and initialized | + +## `Admin` + +### Database Operations + +| Method | Description | +|---------------------------------------------------------------------------------------------------------------------------|--------------------------| +| `CreateDatabase(const std::string& database_name, const DatabaseDescriptor& descriptor, bool ignore_if_exists) -> Result` | Create a database | +| `DropDatabase(const std::string& name, bool ignore_if_not_exists, bool cascade) -> Result` | Drop a database | +| `ListDatabases(std::vector& out) -> Result` | List all databases | +| `DatabaseExists(const std::string& name, bool& out) -> Result` | Check if a database exists | +| `GetDatabaseInfo(const std::string& name, DatabaseInfo& out) -> Result` | Get database metadata | + +### Table Operations + +| Method | Description | +|------------------------------------------------------------------------------------------------------------|-----------------------------| +| `CreateTable(const TablePath& path, const TableDescriptor& descriptor, bool ignore_if_exists) -> Result` | Create a table | +| `DropTable(const TablePath& path, bool ignore_if_not_exists) -> Result` | Drop a table | +| `GetTableInfo(const TablePath& path, TableInfo& out) -> Result` | Get table metadata | +| `ListTables(const std::string& database_name, std::vector& out) -> Result` | List tables in a database | +| `TableExists(const TablePath& path, bool& out) -> Result` | Check if a table exists | + +### Partition Operations + +| Method | Description | +|-------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------| +| `CreatePartition(const TablePath& path, const std::unordered_map& 
partition_spec, bool ignore_if_exists) -> Result` | Create a partition | +| `DropPartition(const TablePath& path, const std::unordered_map& partition_spec, bool ignore_if_not_exists) -> Result` | Drop a partition | +| `ListPartitionInfos(const TablePath& path, std::vector& out) -> Result` | List partition metadata | + +### Offset Operations + +| Method | Description | +|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------------------------------| +| `ListOffsets(const TablePath& path, const std::vector& bucket_ids, const OffsetQuery& query, std::unordered_map& out) -> Result` | Get offsets for buckets | +| `ListPartitionOffsets(const TablePath& path, const std::string& partition_name, const std::vector& bucket_ids, const OffsetQuery& query, std::unordered_map& out) -> Result` | Get offsets for a partition's buckets | + +### Lake Operations + +| Method | Description | +|-----------------------------------------------------------------------------|------------------------------| +| `GetLatestLakeSnapshot(const TablePath& path, LakeSnapshot& out) -> Result` | Get the latest lake snapshot | + +## `Table` + +| Method | Description | +|-------------------------------|------------------------------------------| +| `NewRow() -> GenericRow` | Create a schema-aware row for this table | +| `NewAppend() -> TableAppend` | Create an append builder for log tables | +| `NewUpsert() -> TableUpsert` | Create an upsert builder for PK tables | +| `NewLookup() -> TableLookup` | Create a lookup builder for PK tables | +| `NewScan() -> TableScan` | Create a scan builder | +| `GetTableInfo() -> TableInfo` | Get table metadata | +| `GetTablePath() -> TablePath` | Get the table path | +| `HasPrimaryKey() -> bool` | Check if the table has a primary key | + +## `TableAppend` + +| Method | Description | +|----------------------------------------------|-------------------------| +| `CreateWriter(AppendWriter& out) -> Result` | Create an append writer | + +## `TableUpsert` + +| Method | Description | +|------------------------------------------------------------------------------|--------------------------------------------| +| `PartialUpdateByIndex(std::vector column_indices) -> TableUpsert&` | Configure partial update by column indices | +| `PartialUpdateByName(std::vector column_names) -> TableUpsert&` | Configure partial update by column names | +| `CreateWriter(UpsertWriter& out) -> Result` | Create an upsert writer | + +## `TableLookup` + +| Method | Description | +|-------------------------------------------|-------------------------------------| +| `CreateLookuper(Lookuper& out) -> Result` | Create a lookuper for point lookups | + +## `TableScan` + +| Method | Description | +|----------------------------------------------------------------------|-----------------------------------------------| +| `ProjectByIndex(std::vector column_indices) -> TableScan&` | Project columns by index | +| `ProjectByName(std::vector column_names) -> TableScan&` | Project columns by name | +| `CreateLogScanner(LogScanner& out) -> Result` | Create a record-based log scanner | +| `CreateRecordBatchLogScanner(LogScanner& out) -> Result` | Create an Arrow RecordBatch-based log scanner | + +## `AppendWriter` + +| Method | Description | +|-------------------------------------------------------------|----------------------------------------| +| `Append(const GenericRow& row) 
-> Result` | Append a row (fire-and-forget) | +| `Append(const GenericRow& row, WriteResult& out) -> Result` | Append a row with write acknowledgment | +| `Flush() -> Result` | Flush all pending writes | + +## `UpsertWriter` + +| Method | Description | +|-------------------------------------------------------------|-----------------------------------------------| +| `Upsert(const GenericRow& row) -> Result` | Upsert a row (fire-and-forget) | +| `Upsert(const GenericRow& row, WriteResult& out) -> Result` | Upsert a row with write acknowledgment | +| `Delete(const GenericRow& row) -> Result` | Delete a row by primary key (fire-and-forget) | +| `Delete(const GenericRow& row, WriteResult& out) -> Result` | Delete a row with write acknowledgment | +| `Flush() -> Result` | Flush all pending operations | + +## `WriteResult` + +| Method | Description | +|--------------------|---------------------------------------------| +| `Wait() -> Result` | Wait for server acknowledgment of the write | + +## `Lookuper` + +| Method | Description | +|----------------------------------------------------------------------------|-----------------------------| +| `Lookup(const GenericRow& pk_row, bool& found, GenericRow& out) -> Result` | Lookup a row by primary key | + +## `LogScanner` + +| Method | Description | +|------------------------------------------------------------------------------------------------------|-------------------------------------------| +| `Subscribe(int32_t bucket_id, int64_t offset) -> Result` | Subscribe to a single bucket at an offset | +| `Subscribe(const std::vector& bucket_offsets) -> Result` | Subscribe to multiple buckets | +| `SubscribePartitionBuckets(int64_t partition_id, int32_t bucket_id, int64_t start_offset) -> Result` | Subscribe to a single partition bucket | +| `SubscribePartitionBuckets(const std::vector& subscriptions) -> Result` | Subscribe to multiple partition buckets | +| `Unsubscribe(int32_t bucket_id) -> Result` | Unsubscribe from a non-partitioned bucket | +| `UnsubscribePartition(int64_t partition_id, int32_t bucket_id) -> Result` | Unsubscribe from a partition bucket | +| `Poll(int64_t timeout_ms, ScanRecords& out) -> Result` | Poll individual records | +| `PollRecordBatch(int64_t timeout_ms, ArrowRecordBatches& out) -> Result` | Poll Arrow RecordBatches | + +## `GenericRow` + +### Index-Based Getters + +| Method | Description | +|------------------------------------------------|--------------------------------| +| `GetBool(size_t idx) -> bool` | Get boolean value at index | +| `GetInt32(size_t idx) -> int32_t` | Get 32-bit integer at index | +| `GetInt64(size_t idx) -> int64_t` | Get 64-bit integer at index | +| `GetFloat32(size_t idx) -> float` | Get 32-bit float at index | +| `GetFloat64(size_t idx) -> double` | Get 64-bit float at index | +| `GetString(size_t idx) -> std::string` | Get string at index | +| `GetBytes(size_t idx) -> std::vector` | Get binary data at index | +| `GetDate(size_t idx) -> Date` | Get date at index | +| `GetTime(size_t idx) -> Time` | Get time at index | +| `GetTimestamp(size_t idx) -> Timestamp` | Get timestamp at index | +| `DecimalToString(size_t idx) -> std::string` | Get decimal as string at index | + +### Index-Based Setters + +| Method | Description | +|-----------------------------------------------------------|--------------------------------| +| `SetNull(size_t idx)` | Set field to null | +| `SetBool(size_t idx, bool value)` | Set boolean value | +| `SetInt32(size_t idx, int32_t value)` | Set 32-bit integer | +| 
`SetInt64(size_t idx, int64_t value)` | Set 64-bit integer | +| `SetFloat32(size_t idx, float value)` | Set 32-bit float | +| `SetFloat64(size_t idx, double value)` | Set 64-bit float | +| `SetString(size_t idx, const std::string& value)` | Set string value | +| `SetBytes(size_t idx, const std::vector& value)` | Set binary data | +| `SetDate(size_t idx, const Date& value)` | Set date value | +| `SetTime(size_t idx, const Time& value)` | Set time value | +| `SetTimestampNtz(size_t idx, const Timestamp& value)` | Set timestamp without timezone | +| `SetTimestampLtz(size_t idx, const Timestamp& value)` | Set timestamp with timezone | +| `SetDecimal(size_t idx, const std::string& value)` | Set decimal from string | + +### Name-Based Setters + +When using `table.NewRow()`, the `Set()` method auto-routes to the correct type based on the schema: + +| Method | Description | +|----------------------------------------------------------|-----------------------------------| +| `Set(const std::string& name, bool value)` | Set boolean by column name | +| `Set(const std::string& name, int32_t value)` | Set integer by column name | +| `Set(const std::string& name, int64_t value)` | Set big integer by column name | +| `Set(const std::string& name, float value)` | Set float by column name | +| `Set(const std::string& name, double value)` | Set double by column name | +| `Set(const std::string& name, const std::string& value)` | Set string/decimal by column name | +| `Set(const std::string& name, const Date& value)` | Set date by column name | +| `Set(const std::string& name, const Time& value)` | Set time by column name | +| `Set(const std::string& name, const Timestamp& value)` | Set timestamp by column name | + +### Row Inspection + +| Method | Description | +|------------------------------------|----------------------------------| +| `FieldCount() -> size_t` | Get the number of fields | +| `GetType(size_t idx) -> DatumType` | Get the datum type at index | +| `IsNull(size_t idx) -> bool` | Check if field is null | +| `IsDecimal(size_t idx) -> bool` | Check if field is a decimal type | + +## `ScanRecord` + +| Field | Type | Description | +|-------------|--------------|-------------------------------| +| `bucket_id` | `int32_t` | Bucket this record belongs to | +| `offset` | `int64_t` | Record offset in the log | +| `timestamp` | `int64_t` | Record timestamp | +| `row` | `GenericRow` | Row data | + +## `ScanRecords` + +| Method | Description | +|-----------------------------------------------|--------------------------------------------| +| `Size() -> size_t` | Number of records | +| `Empty() -> bool` | Check if empty | +| `operator[](size_t idx) -> const ScanRecord&` | Access record by index | +| `begin() / end()` | Iterator support for range-based for loops | + +## `ArrowRecordBatch` + +| Method | Description | +|----------------------------------------------------------------|--------------------------------------| +| `GetArrowRecordBatch() -> std::shared_ptr` | Get the underlying Arrow RecordBatch | +| `NumRows() -> int64_t` | Number of rows in the batch | +| `GetTableId() -> int64_t` | Table ID | +| `GetPartitionId() -> int64_t` | Partition ID | +| `GetBucketId() -> int32_t` | Bucket ID | +| `GetBaseOffset() -> int64_t` | First record offset | +| `GetLastOffset() -> int64_t` | Last record offset | + +## `ArrowRecordBatches` + +| Method | Description | +|--------------------------|--------------------------------------------| +| `Size() -> size_t` | Number of batches | +| `Empty() -> bool` | Check if 
empty | +| `operator[](size_t idx)` | Access batch by index | +| `begin() / end()` | Iterator support for range-based for loops | + +## `Schema` + +| Method | Description | +|-----------------------------------|-----------------------------| +| `NewBuilder() -> Schema::Builder` | Create a new schema builder | + +## `Schema::Builder` + +| Method | Description | +|------------------------------------------------------------------------|-------------------------| +| `AddColumn(const std::string& name, const DataType& type) -> Builder&` | Add a column | +| `SetPrimaryKeys(const std::vector& keys) -> Builder&` | Set primary key columns | +| `Build() -> Schema` | Build the schema | + +## `TableDescriptor` + +| Method | Description | +|--------------------------------------------|---------------------------------------| +| `NewBuilder() -> TableDescriptor::Builder` | Create a new table descriptor builder | + +## `TableDescriptor::Builder` + +| Method | Description | +|-----------------------------------------------------------------------------|----------------------------| +| `SetSchema(const Schema& schema) -> Builder&` | Set the table schema | +| `SetPartitionKeys(const std::vector& keys) -> Builder&` | Set partition key columns | +| `SetBucketCount(int32_t count) -> Builder&` | Set the number of buckets | +| `SetBucketKeys(const std::vector& keys) -> Builder&` | Set bucket key columns | +| `SetProperty(const std::string& key, const std::string& value) -> Builder&` | Set a table property | +| `SetComment(const std::string& comment) -> Builder&` | Set a table comment | +| `Build() -> TableDescriptor` | Build the table descriptor | + +## `DataType` + +### Factory Methods + +| Method | Description | +|-----------------------------------------------|------------------------------------| +| `DataType::Boolean()` | Boolean type | +| `DataType::TinyInt()` | 8-bit signed integer | +| `DataType::SmallInt()` | 16-bit signed integer | +| `DataType::Int()` | 32-bit signed integer | +| `DataType::BigInt()` | 64-bit signed integer | +| `DataType::Float()` | 32-bit floating point | +| `DataType::Double()` | 64-bit floating point | +| `DataType::String()` | UTF-8 string | +| `DataType::Bytes()` | Binary data | +| `DataType::Date()` | Date (days since epoch) | +| `DataType::Time()` | Time (milliseconds since midnight) | +| `DataType::Timestamp(int precision)` | Timestamp without timezone | +| `DataType::TimestampLtz(int precision)` | Timestamp with timezone | +| `DataType::Decimal(int precision, int scale)` | Decimal with precision and scale | + +### Accessors + +| Method | Description | +|----------------------|---------------------------------------------| +| `id() -> TypeId` | Get the type ID | +| `precision() -> int` | Get precision (for Decimal/Timestamp types) | +| `scale() -> int` | Get scale (for Decimal type) | + +## `TablePath` + +| Method / Field | Description | +|--------------------------------------------------------------------|-----------------------| +| `TablePath(const std::string& database, const std::string& table)` | Create a table path | +| `database_name -> std::string` | Database name | +| `table_name -> std::string` | Table name | +| `ToString() -> std::string` | String representation | + +## `TableInfo` + +| Field | Type | Description | +|-------------------|------------------------------------------------|-------------------------------------| +| `table_id` | `int64_t` | Table ID | +| `schema_id` | `int32_t` | Schema ID | +| `table_path` | `TablePath` | Table path | +| `created_time` 
| `int64_t` | Creation timestamp | +| `modified_time` | `int64_t` | Last modification timestamp | +| `primary_keys` | `std::vector` | Primary key columns | +| `bucket_keys` | `std::vector` | Bucket key columns | +| `partition_keys` | `std::vector` | Partition key columns | +| `num_buckets` | `int32_t` | Number of buckets | +| `has_primary_key` | `bool` | Whether the table has a primary key | +| `is_partitioned` | `bool` | Whether the table is partitioned | +| `properties` | `std::unordered_map` | Table properties | +| `comment` | `std::string` | Table comment | +| `schema` | `Schema` | Table schema | + +## Temporal Types + +### `Date` + +| Method | Description | +|-----------------------------------------------|------------------------------| +| `Date::FromDays(int32_t days)` | Create from days since epoch | +| `Date::FromYMD(int year, int month, int day)` | Create from year, month, day | +| `Year() -> int` | Get year | +| `Month() -> int` | Get month | +| `Day() -> int` | Get day | + +### `Time` + +| Method | Description | +|---------------------------------------------------|----------------------------------------------| +| `Time::FromMillis(int32_t millis)` | Create from milliseconds since midnight | +| `Time::FromHMS(int hour, int minute, int second)` | Create from hour, minute, second | +| `Hour() -> int` | Get hour | +| `Minute() -> int` | Get minute | +| `Second() -> int` | Get second | +| `Millis() -> int64_t` | Get sub-second millisecond component (0-999) | + +### `Timestamp` + +| Method | Description | +|----------------------------------------------------------------------|------------------------------------------| +| `Timestamp::FromMillis(int64_t millis)` | Create from milliseconds since epoch | +| `Timestamp::FromMillisNanos(int64_t millis, int32_t nanos)` | Create from milliseconds and nanoseconds | +| `Timestamp::FromTimePoint(std::chrono::system_clock::time_point tp)` | Create from a time point | + +## `PartitionInfo` + +| Field | Type | Description | +|------------------|---------------|----------------| +| `partition_id` | `int64_t` | Partition ID | +| `partition_name` | `std::string` | Partition name | + +## `DatabaseDescriptor` + +| Field | Type | Description | +|--------------|------------------------------------------------|-------------------| +| `comment` | `std::string` | Database comment | +| `properties` | `std::unordered_map` | Custom properties | + +## `DatabaseInfo` + +| Field | Type | Description | +|-----------------|------------------------------------------------|-----------------------------| +| `database_name` | `std::string` | Database name | +| `comment` | `std::string` | Database comment | +| `properties` | `std::unordered_map` | Custom properties | +| `created_time` | `int64_t` | Creation timestamp | +| `modified_time` | `int64_t` | Last modification timestamp | + +## `LakeSnapshot` + +| Field | Type | Description | +|------------------|-----------------------------|--------------------| +| `snapshot_id` | `int64_t` | Snapshot ID | +| `bucket_offsets` | `std::vector` | All bucket offsets | + +## `BucketOffset` + +| Field | Type | Description | +|----------------|-----------|--------------| +| `table_id` | `int64_t` | Table ID | +| `partition_id` | `int64_t` | Partition ID | +| `bucket_id` | `int32_t` | Bucket ID | +| `offset` | `int64_t` | Offset value | + +## `OffsetQuery` + +| Method | Description | +|----------------------------------------------------|-----------------------------------------| +| `OffsetQuery::Earliest()` | Query for the 
earliest available offset | +| `OffsetQuery::Latest()` | Query for the latest offset | +| `OffsetQuery::FromTimestamp(int64_t timestamp_ms)` | Query offset at a specific timestamp | + +## Constants + +| Constant | Value | Description | +|--------------------------|--------|---------------------------------------------------------| +| `fluss::EARLIEST_OFFSET` | `-2` | Start reading from the earliest available offset | + +To start reading from the latest offset (only new records), resolve the current offset via `ListOffsets` before subscribing: + +```cpp +std::unordered_map offsets; +admin.ListOffsets(table_path, {0}, fluss::OffsetQuery::Latest(), offsets); +scanner.Subscribe(0, offsets[0]); +``` + +## Enums + +### `TypeId` + +| Value | Description | +|----------------|----------------------------| +| `Boolean` | Boolean type | +| `TinyInt` | 8-bit signed integer | +| `SmallInt` | 16-bit signed integer | +| `Int` | 32-bit signed integer | +| `BigInt` | 64-bit signed integer | +| `Float` | 32-bit floating point | +| `Double` | 64-bit floating point | +| `String` | UTF-8 string | +| `Bytes` | Binary data | +| `Date` | Date | +| `Time` | Time | +| `Timestamp` | Timestamp without timezone | +| `TimestampLtz` | Timestamp with timezone | +| `Decimal` | Decimal | + +### `DatumType` + +| Value | C++ Type | Description | +|-----------------|------------------------|---------------------------------| +| `Null` | -- | Null value | +| `Bool` | `bool` | Boolean | +| `Int32` | `int32_t` | 32-bit integer | +| `Int64` | `int64_t` | 64-bit integer | +| `Float32` | `float` | 32-bit float | +| `Float64` | `double` | 64-bit float | +| `String` | `std::string` | String | +| `Bytes` | `std::vector` | Binary data | +| `DecimalI64` | `int64_t` | Decimal (64-bit internal) | +| `DecimalI128` | `__int128` | Decimal (128-bit internal) | +| `DecimalString` | `std::string` | Decimal (string representation) | +| `Date` | `Date` | Date | +| `Time` | `Time` | Time | +| `TimestampNtz` | `Timestamp` | Timestamp without timezone | +| `TimestampLtz` | `Timestamp` | Timestamp with timezone | + +### `OffsetSpec` + +| Value | Description | +|-------------|--------------------------------| +| `Earliest` | Earliest available offset | +| `Latest` | Latest offset | +| `Timestamp` | Offset at a specific timestamp | diff --git a/website/docs/user-guide/cpp/data-types.md b/website/docs/user-guide/cpp/data-types.md new file mode 100644 index 00000000..65e6e4a4 --- /dev/null +++ b/website/docs/user-guide/cpp/data-types.md @@ -0,0 +1,109 @@ +--- +sidebar_position: 3 +--- +# Data Types + +## Schema DataTypes + +| DataType | Description | +|----------------------------|------------------------------------| +| `DataType::Boolean()` | Boolean value | +| `DataType::TinyInt()` | 8-bit signed integer | +| `DataType::SmallInt()` | 16-bit signed integer | +| `DataType::Int()` | 32-bit signed integer | +| `DataType::BigInt()` | 64-bit signed integer | +| `DataType::Float()` | 32-bit floating point | +| `DataType::Double()` | 64-bit floating point | +| `DataType::String()` | UTF-8 string | +| `DataType::Bytes()` | Binary data | +| `DataType::Date()` | Date (days since epoch) | +| `DataType::Time()` | Time (milliseconds since midnight) | +| `DataType::Timestamp()` | Timestamp without timezone | +| `DataType::TimestampLtz()` | Timestamp with timezone | +| `DataType::Decimal(p, s)` | Decimal with precision and scale | + +## GenericRow Setters + +```cpp +fluss::GenericRow row; +row.SetNull(0); +row.SetBool(1, true); +row.SetInt32(2, 42); +row.SetInt64(3, 
1234567890L); +row.SetFloat32(4, 3.14f); +row.SetFloat64(5, 2.71828); +row.SetString(6, "hello"); +row.SetBytes(7, {0x01, 0x02, 0x03}); +``` + +## Name-Based Setters + +When using `table.NewRow()`, you can set fields by column name. The setter automatically routes to the correct type based on the schema: + +```cpp +auto row = table.NewRow(); +row.Set("user_id", 1); +row.Set("name", "Alice"); +row.Set("score", 95.5f); +row.Set("balance", "1234.56"); // decimal as string +row.Set("birth_date", fluss::Date::FromYMD(1990, 3, 15)); +row.Set("login_time", fluss::Time::FromHMS(9, 30, 0)); +row.Set("created_at", fluss::Timestamp::FromMillis(1700000000000)); +``` + +## GenericRow Getters + +```cpp +std::string name = result_row.GetString(1); +float score = result_row.GetFloat32(3); +std::string balance = result_row.DecimalToString(4); +fluss::Date date = result_row.GetDate(5); +fluss::Time time = result_row.GetTime(6); +fluss::Timestamp ts = result_row.GetTimestamp(7); +``` + +## DatumType Enum + +| DatumType | C++ Type | Getter | +|-----------------|------------------------|------------------------| +| `Null` | -- | `IsNull(idx)` | +| `Bool` | `bool` | `GetBool(idx)` | +| `Int32` | `int32_t` | `GetInt32(idx)` | +| `Int64` | `int64_t` | `GetInt64(idx)` | +| `Float32` | `float` | `GetFloat32(idx)` | +| `Float64` | `double` | `GetFloat64(idx)` | +| `String` | `std::string` | `GetString(idx)` | +| `Bytes` | `std::vector` | `GetBytes(idx)` | +| `Date` | `Date` | `GetDate(idx)` | +| `Time` | `Time` | `GetTime(idx)` | +| `TimestampNtz` | `Timestamp` | `GetTimestamp(idx)` | +| `TimestampLtz` | `Timestamp` | `GetTimestamp(idx)` | +| `DecimalString` | `std::string` | `DecimalToString(idx)` | + +## Type Checking + +```cpp +if (rec.row.GetType(0) == fluss::DatumType::Int32) { + int32_t value = rec.row.GetInt32(0); +} +if (rec.row.IsNull(1)) { + // field is null +} +if (rec.row.IsDecimal(2)) { + std::string decimal_str = rec.row.DecimalToString(2); +} +``` + +## Constants + +```cpp +constexpr int64_t fluss::EARLIEST_OFFSET = -2; // Start from earliest +``` + +To start reading from the latest offset, resolve the current offset via `ListOffsets` before subscribing: + +```cpp +std::unordered_map offsets; +admin.ListOffsets(table_path, {0}, fluss::OffsetQuery::Latest(), offsets); +scanner.Subscribe(0, offsets[0]); +``` diff --git a/website/docs/user-guide/cpp/error-handling.md b/website/docs/user-guide/cpp/error-handling.md new file mode 100644 index 00000000..e1ec058e --- /dev/null +++ b/website/docs/user-guide/cpp/error-handling.md @@ -0,0 +1,128 @@ +--- +sidebar_position: 4 +--- +# Error Handling + +All C++ client operations return a `fluss::Result` struct instead of throwing exceptions. This gives you explicit control over error handling. + +## The `Result` Struct + +```cpp +#include "fluss.hpp" + +// All operations return fluss::Result +fluss::Result result = admin.CreateTable(path, descriptor); +if (!result.Ok()) { + std::cerr << "Error code: " << result.error_code << std::endl; + std::cerr << "Error message: " << result.error_message << std::endl; +} +``` + +| Field / Method | Type | Description | +|------------------|---------------|-------------------------------------------| +| `error_code` | `int32_t` | 0 for success, non-zero for errors | +| `error_message` | `std::string` | Human-readable error description | +| `Ok()` | `bool` | Returns `true` if the operation succeeded | + +## Handling Errors + +Check the `Result` after each operation and decide how to respond, e.g. 
log and continue, retry, or abort: + +```cpp +fluss::Connection conn; +fluss::Result result = fluss::Connection::Create(config, conn); +if (!result.Ok()) { + // Log, retry, or propagate the error as appropriate + std::cerr << "Connection failed (code " << result.error_code + << "): " << result.error_message << std::endl; + return 1; +} +``` + +## Connection State Checking + +Use `Available()` to verify that a connection or object is valid before using it: + +```cpp +fluss::Connection conn; +if (!conn.Available()) { + // Connection not initialized or already moved +} + +fluss::Configuration config; +config.bootstrap_servers = "127.0.0.1:9123"; +fluss::Result result = fluss::Connection::Create(config, conn); +if (result.Ok() && conn.Available()) { + // Connection is ready to use +} +``` + +## Common Error Scenarios + +### Connection Refused + +The cluster is not running or the address is incorrect: + +```cpp +fluss::Configuration config; +config.bootstrap_servers = "127.0.0.1:9123"; +fluss::Connection conn; +fluss::Result result = fluss::Connection::Create(config, conn); +if (!result.Ok()) { + // "Connection refused" or timeout error + std::cerr << "Cannot connect to cluster: " << result.error_message << std::endl; +} +``` + +### Table Not Found + +Attempting to access a table that does not exist: + +```cpp +fluss::Table table; +fluss::Result result = conn.GetTable(fluss::TablePath("fluss", "nonexistent"), table); +if (!result.Ok()) { + // Table not found error + std::cerr << "Table error: " << result.error_message << std::endl; +} +``` + +### Partition Not Found + +Writing to a partitioned primary key table before creating partitions: + +```cpp +// This will fail if partitions are not created first +auto row = table.NewRow(); +row.Set("user_id", 1); +row.Set("region", "US"); +row.Set("score", static_cast(100)); +fluss::WriteResult wr; +fluss::Result result = writer.Upsert(row, wr); +if (!result.Ok()) { + // Partition not found, create partitions before writing + std::cerr << "Write error: " << result.error_message << std::endl; +} +``` + +### Schema Mismatch + +Using incorrect types or column indices when writing: + +```cpp +fluss::GenericRow row; +// Setting wrong type for a column will result in an error +// when the row is sent to the server +row.SetString(0, "not_an_integer"); // Column 0 expects Int +fluss::Result result = writer.Append(row); +if (!result.Ok()) { + std::cerr << "Schema mismatch: " << result.error_message << std::endl; +} +``` + +## Best Practices + +1. **Always check `Result`**: Never ignore the return value of operations that return `Result`. +2. **Handle errors gracefully**: Log errors and retry or fail gracefully rather than crashing. +3. **Verify connection state**: Use `Available()` to check connection validity before operations. +4. **Create partitions before writing**: For partitioned primary key tables, always create partitions before attempting upserts. 
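+
+Putting these practices together, here is a minimal sketch for writing to a partitioned primary key table. It assumes `conn` and `admin` were obtained as shown above, and a hypothetical table `fluss.partitioned_users` with primary key (`user_id`, `region`) that is partitioned by `region`:
+
+```cpp
+fluss::TablePath table_path("fluss", "partitioned_users");
+
+// Verify the connection before using it (practice 3).
+if (!conn.Available()) {
+    std::cerr << "connection is not available" << std::endl;
+    return 1;
+}
+
+// Create the partition before upserting into it (practice 4).
+fluss::Result result = admin.CreatePartition(table_path, {{"region", "US"}}, true);
+if (!result.Ok()) {
+    std::cerr << "create partition failed: " << result.error_message << std::endl;
+    return 1;
+}
+
+fluss::Table table;
+result = conn.GetTable(table_path, table);
+if (!result.Ok()) {
+    std::cerr << "get table failed: " << result.error_message << std::endl;
+    return 1;
+}
+
+fluss::UpsertWriter writer;
+result = table.NewUpsert().CreateWriter(writer);
+if (!result.Ok()) {
+    std::cerr << "create writer failed: " << result.error_message << std::endl;
+    return 1;
+}
+
+// Check every Result and handle failures explicitly (practices 1 and 2).
+auto row = table.NewRow();
+row.Set("user_id", 1001);
+row.Set("region", "US");
+result = writer.Upsert(row);
+if (!result.Ok()) {
+    std::cerr << "upsert failed: " << result.error_message << std::endl;
+    return 1;
+}
+result = writer.Flush();
+if (!result.Ok()) {
+    std::cerr << "flush failed: " << result.error_message << std::endl;
+    return 1;
+}
+```
+
+For per-record acknowledgment, pass a `fluss::WriteResult` to `Upsert` and call `Wait()` on it, as shown in the primary key table examples.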
diff --git a/website/docs/user-guide/cpp/example/_category_.json b/website/docs/user-guide/cpp/example/_category_.json new file mode 100644 index 00000000..dd222949 --- /dev/null +++ b/website/docs/user-guide/cpp/example/_category_.json @@ -0,0 +1,4 @@ +{ + "label": "Example", + "position": 5 +} diff --git a/website/docs/user-guide/cpp/example/admin-operations.md b/website/docs/user-guide/cpp/example/admin-operations.md new file mode 100644 index 00000000..c27dc209 --- /dev/null +++ b/website/docs/user-guide/cpp/example/admin-operations.md @@ -0,0 +1,107 @@ +--- +sidebar_position: 3 +--- +# Admin Operations + +## Get Admin Interface + +```cpp +fluss::Admin admin; +conn.GetAdmin(admin); +``` + +## Table Operations + +```cpp +fluss::TablePath table_path("fluss", "my_table"); + +auto schema = fluss::Schema::NewBuilder() + .AddColumn("id", fluss::DataType::Int()) + .AddColumn("name", fluss::DataType::String()) + .AddColumn("score", fluss::DataType::Float()) + .AddColumn("age", fluss::DataType::Int()) + .Build(); + +auto descriptor = fluss::TableDescriptor::NewBuilder() + .SetSchema(schema) + .SetBucketCount(3) + .SetComment("Example table") + .Build(); + +// Create table +admin.CreateTable(table_path, descriptor, true); + +// Get table information +fluss::TableInfo table_info; +admin.GetTableInfo(table_path, table_info); +std::cout << "Table ID: " << table_info.table_id << std::endl; +std::cout << "Number of buckets: " << table_info.num_buckets << std::endl; +std::cout << "Has primary key: " << table_info.has_primary_key << std::endl; +std::cout << "Is partitioned: " << table_info.is_partitioned << std::endl; + +// Drop table +admin.DropTable(table_path, true); +``` + +## Schema Builder Options + +```cpp +// Schema with primary key +auto pk_schema = fluss::Schema::NewBuilder() + .AddColumn("id", fluss::DataType::Int()) + .AddColumn("name", fluss::DataType::String()) + .AddColumn("value", fluss::DataType::Double()) + .SetPrimaryKeys({"id"}) + .Build(); + +// Table descriptor with partitioning +auto descriptor = fluss::TableDescriptor::NewBuilder() + .SetSchema(schema) + .SetPartitionKeys({"date"}) + .SetBucketCount(3) + .SetBucketKeys({"user_id"}) + .SetProperty("retention_days", "7") + .SetComment("Sample table") + .Build(); +``` + +## Offset Operations + +```cpp +std::vector bucket_ids = {0, 1, 2}; + +// Query earliest offsets +std::unordered_map earliest_offsets; +admin.ListOffsets(table_path, bucket_ids, + fluss::OffsetQuery::Earliest(), earliest_offsets); + +// Query latest offsets +std::unordered_map latest_offsets; +admin.ListOffsets(table_path, bucket_ids, + fluss::OffsetQuery::Latest(), latest_offsets); + +// Query offsets for a specific timestamp +std::unordered_map timestamp_offsets; +admin.ListOffsets(table_path, bucket_ids, + fluss::OffsetQuery::FromTimestamp(timestamp_ms), + timestamp_offsets); + +// Query partition offsets +std::unordered_map partition_offsets; +admin.ListPartitionOffsets(table_path, "partition_name", + bucket_ids, fluss::OffsetQuery::Latest(), + partition_offsets); +``` + +## Lake Snapshot + +```cpp +fluss::LakeSnapshot snapshot; +admin.GetLatestLakeSnapshot(table_path, snapshot); +std::cout << "Snapshot ID: " << snapshot.snapshot_id << std::endl; +for (const auto& bucket_offset : snapshot.bucket_offsets) { + std::cout << " Table " << bucket_offset.table_id + << ", Bucket " << bucket_offset.bucket_id + << ": offset=" << bucket_offset.offset << std::endl; +} +``` diff --git a/website/docs/user-guide/cpp/example/configuration.md 
b/website/docs/user-guide/cpp/example/configuration.md new file mode 100644 index 00000000..c4fc6678 --- /dev/null +++ b/website/docs/user-guide/cpp/example/configuration.md @@ -0,0 +1,35 @@ +--- +sidebar_position: 2 +--- +# Configuration + +## Connection Setup + +```cpp +#include "fluss.hpp" + +fluss::Configuration config; +config.bootstrap_servers = "127.0.0.1:9123"; + +fluss::Connection conn; +fluss::Result result = fluss::Connection::Create(config, conn); + +if (!result.Ok()) { + std::cerr << "Connection failed: " << result.error_message << std::endl; +} +``` + +## Configuration Options + +All fields have sensible defaults. Only `bootstrap_servers` typically needs to be set. + +```cpp +fluss::Configuration config; +config.bootstrap_servers = "127.0.0.1:9123"; // Coordinator address +config.writer_request_max_size = 10 * 1024 * 1024; // Max request size (10 MB) +config.writer_acks = "all"; // Wait for all replicas +config.writer_retries = std::numeric_limits::max(); // Retry on failure +config.writer_batch_size = 2 * 1024 * 1024; // Batch size (2 MB) +config.scanner_remote_log_prefetch_num = 4; // Remote log prefetch count +config.remote_file_download_thread_num = 3; // Download threads +``` diff --git a/website/docs/user-guide/cpp/example/index.md b/website/docs/user-guide/cpp/example/index.md new file mode 100644 index 00000000..51f60e41 --- /dev/null +++ b/website/docs/user-guide/cpp/example/index.md @@ -0,0 +1,63 @@ +--- +sidebar_position: 1 +--- +# Example + +Minimal working example: connect to Fluss, create a table, write data, and read it back. + +```cpp +#include +#include "fluss.hpp" + +int main() { + // Connect + fluss::Configuration config; + config.bootstrap_servers = "127.0.0.1:9123"; + + fluss::Connection conn; + fluss::Connection::Create(config, conn); + + fluss::Admin admin; + conn.GetAdmin(admin); + + // Create a log table + fluss::TablePath table_path("fluss", "quickstart_cpp"); + auto schema = fluss::Schema::NewBuilder() + .AddColumn("id", fluss::DataType::Int()) + .AddColumn("name", fluss::DataType::String()) + .Build(); + auto descriptor = fluss::TableDescriptor::NewBuilder() + .SetSchema(schema) + .Build(); + admin.CreateTable(table_path, descriptor, true); + + // Write + fluss::Table table; + conn.GetTable(table_path, table); + + fluss::AppendWriter writer; + table.NewAppend().CreateWriter(writer); + + fluss::GenericRow row; + row.SetInt32(0, 1); + row.SetString(1, "hello"); + writer.Append(row); + writer.Flush(); + + // Read + fluss::LogScanner scanner; + table.NewScan().CreateLogScanner(scanner); + auto info = table.GetTableInfo(); + for (int b = 0; b < info.num_buckets; ++b) { + scanner.Subscribe(b, 0); + } + fluss::ScanRecords records; + scanner.Poll(5000, records); + for (const auto& rec : records) { + std::cout << "id=" << rec.row.GetInt32(0) + << ", name=" << rec.row.GetString(1) << std::endl; + } + + return 0; +} +``` diff --git a/website/docs/user-guide/cpp/example/log-tables.md b/website/docs/user-guide/cpp/example/log-tables.md new file mode 100644 index 00000000..c94bb845 --- /dev/null +++ b/website/docs/user-guide/cpp/example/log-tables.md @@ -0,0 +1,121 @@ +--- +sidebar_position: 4 +--- +# Log Tables + +Log tables are append-only tables without primary keys, suitable for event streaming. 
+ +## Creating a Log Table + +```cpp +auto schema = fluss::Schema::NewBuilder() + .AddColumn("event_id", fluss::DataType::Int()) + .AddColumn("event_type", fluss::DataType::String()) + .AddColumn("timestamp", fluss::DataType::BigInt()) + .Build(); + +auto descriptor = fluss::TableDescriptor::NewBuilder() + .SetSchema(schema) + .Build(); + +fluss::TablePath table_path("fluss", "events"); +admin.CreateTable(table_path, descriptor, true); +``` + +## Writing to Log Tables + +```cpp +fluss::Table table; +conn.GetTable(table_path, table); + +fluss::AppendWriter writer; +table.NewAppend().CreateWriter(writer); + +fluss::GenericRow row; +row.SetInt32(0, 1); // event_id +row.SetString(1, "user_login"); // event_type +row.SetInt64(2, 1704067200000L); // timestamp +writer.Append(row); + +writer.Flush(); +``` + +## Reading from Log Tables + +```cpp +fluss::LogScanner scanner; +table.NewScan().CreateLogScanner(scanner); + +auto info = table.GetTableInfo(); +for (int b = 0; b < info.num_buckets; ++b) { + scanner.Subscribe(b, 0); +} + +fluss::ScanRecords records; +scanner.Poll(5000, records); // timeout in ms + +for (const auto& rec : records) { + std::cout << "event_id=" << rec.row.GetInt32(0) + << " event_type=" << rec.row.GetString(1) + << " timestamp=" << rec.row.GetInt64(2) + << " @ offset=" << rec.offset << std::endl; +} +``` + +**Batch subscribe:** + +```cpp +std::vector subscriptions; +subscriptions.push_back({0, 0}); // bucket 0, offset 0 +subscriptions.push_back({1, 100}); // bucket 1, offset 100 +scanner.Subscribe(subscriptions); +``` + +**Unsubscribe from a bucket:** + +```cpp +// Stop receiving records from bucket 1 +scanner.Unsubscribe(1); +``` + +**Arrow RecordBatch polling (high performance):** + +```cpp +#include + +fluss::LogScanner arrow_scanner; +table.NewScan().CreateRecordBatchLogScanner(arrow_scanner); + +for (int b = 0; b < info.num_buckets; ++b) { + arrow_scanner.Subscribe(b, 0); +} + +fluss::ArrowRecordBatches batches; +arrow_scanner.PollRecordBatch(5000, batches); + +for (size_t i = 0; i < batches.Size(); ++i) { + const auto& batch = batches[i]; + if (batch->Available()) { + auto arrow_batch = batch->GetArrowRecordBatch(); + std::cout << "Batch " << i << ": " << arrow_batch->num_rows() << " rows" + << ", partition_id=" << batch->GetPartitionId() + << ", bucket_id=" << batch->GetBucketId() << std::endl; + } +} +``` + +## Column Projection + +```cpp +// Project by column index +fluss::LogScanner projected_scanner; +table.NewScan().ProjectByIndex({0, 2}).CreateLogScanner(projected_scanner); + +// Project by column name +fluss::LogScanner name_projected_scanner; +table.NewScan().ProjectByName({"event_id", "timestamp"}).CreateLogScanner(name_projected_scanner); + +// Arrow RecordBatch with projection +fluss::LogScanner projected_arrow_scanner; +table.NewScan().ProjectByIndex({0, 2}).CreateRecordBatchLogScanner(projected_arrow_scanner); +``` diff --git a/website/docs/user-guide/cpp/example/partitioned-tables.md b/website/docs/user-guide/cpp/example/partitioned-tables.md new file mode 100644 index 00000000..6a6927f5 --- /dev/null +++ b/website/docs/user-guide/cpp/example/partitioned-tables.md @@ -0,0 +1,180 @@ +--- +sidebar_position: 6 +--- +# Partitioned Tables + +Partitioned tables distribute data across partitions based on partition column values, enabling efficient data organization and querying. Both log tables and primary key tables support partitioning. 
+ +## Partitioned Log Tables + +### Creating a Partitioned Log Table + +```cpp +auto schema = fluss::Schema::NewBuilder() + .AddColumn("event_id", fluss::DataType::Int()) + .AddColumn("event_type", fluss::DataType::String()) + .AddColumn("dt", fluss::DataType::String()) + .AddColumn("region", fluss::DataType::String()) + .Build(); + +auto descriptor = fluss::TableDescriptor::NewBuilder() + .SetSchema(schema) + .SetPartitionKeys({"dt", "region"}) + .SetBucketCount(3) + .Build(); + +fluss::TablePath table_path("fluss", "partitioned_events"); +admin.CreateTable(table_path, descriptor, true); +``` + +### Writing to Partitioned Log Tables + +**Partitions must exist before writing data, otherwise the client will by default retry indefinitely.** Include partition column values in each row, the client routes records to the correct partition automatically. + +```cpp +fluss::Table table; +conn.GetTable(table_path, table); + +fluss::AppendWriter writer; +table.NewAppend().CreateWriter(writer); + +fluss::GenericRow row; +row.SetInt32(0, 1); +row.SetString(1, "user_login"); +row.SetString(2, "2024-01-15"); +row.SetString(3, "US"); +writer.Append(row); +writer.Flush(); +``` + +### Reading from Partitioned Log Tables + +For partitioned tables, use partition-aware subscribe methods. + +```cpp +fluss::Table table; +conn.GetTable(table_path, table); + +fluss::LogScanner scanner; +table.NewScan().CreateLogScanner(scanner); + +// Subscribe to individual partitions +for (const auto& pi : partition_infos) { + scanner.SubscribePartitionBuckets(pi.partition_id, 0, 0); +} + +fluss::ScanRecords records; +scanner.Poll(5000, records); + +for (const auto& rec : records) { + std::cout << "bucket_id=" << rec.bucket_id + << " offset=" << rec.offset << std::endl; +} + +// Or batch-subscribe to all partitions at once +fluss::LogScanner batch_scanner; +table.NewScan().CreateLogScanner(batch_scanner); + +std::vector subs; +for (const auto& pi : partition_infos) { + subs.push_back({pi.partition_id, 0, 0}); +} +batch_scanner.SubscribePartitionBuckets(subs); +``` + +**Unsubscribe from a partition bucket:** + +```cpp +// Stop receiving records from a specific partition bucket +scanner.UnsubscribePartition(partition_infos[0].partition_id, 0); +``` + +### Managing Partitions + +```cpp +// Create a partition +admin.CreatePartition(table_path, {{"dt", "2024-01-15"}, {"region", "EMEA"}}, true); + +// List partitions +std::vector partition_infos; +admin.ListPartitionInfos(table_path, partition_infos); + +// Query partition offsets +std::vector bucket_ids = {0, 1, 2}; +std::unordered_map offsets; +admin.ListPartitionOffsets(table_path, "2024-01-15$US", + bucket_ids, fluss::OffsetQuery::Latest(), offsets); +``` + +## Partitioned Primary Key Tables + +Partitioned KV tables combine partitioning with primary key operations. Partition columns must be part of the primary key. 
+ +### Creating a Partitioned Primary Key Table + +```cpp +auto schema = fluss::Schema::NewBuilder() + .AddColumn("user_id", fluss::DataType::Int()) + .AddColumn("region", fluss::DataType::String()) + .AddColumn("zone", fluss::DataType::BigInt()) + .AddColumn("score", fluss::DataType::BigInt()) + .SetPrimaryKeys({"user_id", "region", "zone"}) + .Build(); + +auto descriptor = fluss::TableDescriptor::NewBuilder() + .SetSchema(schema) + .SetPartitionKeys({"region", "zone"}) + .SetBucketCount(3) + .Build(); + +fluss::TablePath table_path("fluss", "partitioned_users"); +admin.CreateTable(table_path, descriptor, true); +``` + +### Writing to Partitioned Primary Key Tables + +**Partitions must exist before upserting data, otherwise the client will by default retry indefinitely.** + +```cpp +fluss::Table table; +conn.GetTable(table_path, table); + +// Create partitions first +admin.CreatePartition(table_path, {{"region", "APAC"}, {"zone", "1"}}, true); +admin.CreatePartition(table_path, {{"region", "EMEA"}, {"zone", "2"}}, true); +admin.CreatePartition(table_path, {{"region", "US"}, {"zone", "3"}}, true); + +fluss::UpsertWriter writer; +table.NewUpsert().CreateWriter(writer); + +auto row = table.NewRow(); +row.Set("user_id", 1001); +row.Set("region", "APAC"); +row.Set("zone", static_cast(1)); +row.Set("score", static_cast(1234)); +writer.Upsert(row); +writer.Flush(); +``` + +### Looking Up Records in Partitioned Tables + +Lookup requires all primary key columns including partition columns. + +> **Note:** Scanning partitioned primary key tables is not supported. Use lookup operations instead. + +```cpp +fluss::Lookuper lookuper; +table.NewLookup().CreateLookuper(lookuper); + +auto pk = table.NewRow(); +pk.Set("user_id", 1001); +pk.Set("region", "APAC"); +pk.Set("zone", static_cast(1)); + +bool found = false; +fluss::GenericRow result; +lookuper.Lookup(pk, found, result); +if (found) { + std::cout << "score=" << result.GetInt64(3) << std::endl; +} +``` diff --git a/website/docs/user-guide/cpp/example/primary-key-tables.md b/website/docs/user-guide/cpp/example/primary-key-tables.md new file mode 100644 index 00000000..7aa87e31 --- /dev/null +++ b/website/docs/user-guide/cpp/example/primary-key-tables.md @@ -0,0 +1,133 @@ +--- +sidebar_position: 5 +--- +# Primary Key Tables + +Primary key tables (KV tables) support upsert, delete, and lookup operations. 
+ +## Creating a Primary Key Table + +```cpp +auto schema = fluss::Schema::NewBuilder() + .AddColumn("id", fluss::DataType::Int()) + .AddColumn("name", fluss::DataType::String()) + .AddColumn("age", fluss::DataType::BigInt()) + .SetPrimaryKeys({"id"}) + .Build(); + +auto descriptor = fluss::TableDescriptor::NewBuilder() + .SetSchema(schema) + .SetBucketCount(3) + .Build(); + +fluss::TablePath table_path("fluss", "users"); +admin.CreateTable(table_path, descriptor, true); +``` + +## Upserting Records + +```cpp +fluss::Table table; +conn.GetTable(table_path, table); + +fluss::UpsertWriter upsert_writer; +table.NewUpsert().CreateWriter(upsert_writer); + +// Fire-and-forget upserts +{ + auto row = table.NewRow(); + row.Set("id", 1); + row.Set("name", "Alice"); + row.Set("age", static_cast(25)); + upsert_writer.Upsert(row); +} +{ + auto row = table.NewRow(); + row.Set("id", 2); + row.Set("name", "Bob"); + row.Set("age", static_cast(30)); + upsert_writer.Upsert(row); +} +upsert_writer.Flush(); + +// Per-record acknowledgment +{ + auto row = table.NewRow(); + row.Set("id", 3); + row.Set("name", "Charlie"); + row.Set("age", static_cast(35)); + fluss::WriteResult wr; + upsert_writer.Upsert(row, wr); + wr.Wait(); +} +``` + +## Updating Records + +Upsert with the same primary key to update an existing record. + +```cpp +auto row = table.NewRow(); +row.Set("id", 1); +row.Set("name", "Alice Updated"); +row.Set("age", static_cast(26)); +fluss::WriteResult wr; +upsert_writer.Upsert(row, wr); +wr.Wait(); +``` + +## Deleting Records + +```cpp +auto pk_row = table.NewRow(); +pk_row.Set("id", 2); +fluss::WriteResult wr; +upsert_writer.Delete(pk_row, wr); +wr.Wait(); +``` + +## Partial Updates + +Update only specific columns while preserving others. + +```cpp +// By column names +fluss::UpsertWriter partial_writer; +table.NewUpsert() + .PartialUpdateByName({"id", "age"}) + .CreateWriter(partial_writer); + +auto row = table.NewRow(); +row.Set("id", 1); +row.Set("age", static_cast(27)); +fluss::WriteResult wr; +partial_writer.Upsert(row, wr); +wr.Wait(); + +// By column indices +fluss::UpsertWriter partial_writer_idx; +table.NewUpsert() + .PartialUpdateByIndex({0, 2}) + .CreateWriter(partial_writer_idx); +``` + +## Looking Up Records + +```cpp +fluss::Lookuper lookuper; +table.NewLookup().CreateLookuper(lookuper); + +auto pk_row = table.NewRow(); +pk_row.Set("id", 1); + +bool found = false; +fluss::GenericRow result_row; +lookuper.Lookup(pk_row, found, result_row); + +if (found) { + std::cout << "Found: name=" << result_row.GetString(1) + << ", age=" << result_row.GetInt64(2) << std::endl; +} else { + std::cout << "Not found" << std::endl; +} +``` diff --git a/website/docs/user-guide/cpp/installation.md b/website/docs/user-guide/cpp/installation.md new file mode 100644 index 00000000..6360da43 --- /dev/null +++ b/website/docs/user-guide/cpp/installation.md @@ -0,0 +1,107 @@ +--- +sidebar_position: 1 +--- +# Installation + +The C++ bindings are not yet published as a package. You need to build from source. 
+ +**Prerequisites:** CMake 3.22+, C++17 compiler, Rust 1.85+, Apache Arrow C++ library + +```bash +git clone https://github.com/apache/fluss-rust.git +cd fluss-rust +``` + +Install dependencies: + +```bash +# macOS +brew install cmake arrow + +# Ubuntu/Debian +sudo apt-get install cmake libarrow-dev +``` + +If Arrow is not available via package manager, build from source: + +```bash +git clone https://github.com/apache/arrow.git +cd arrow/cpp +cmake -B build -DARROW_BUILD_SHARED=ON +cmake --build build +sudo cmake --install build +``` + +Build the C++ bindings: + +```bash +cd bindings/cpp +mkdir -p build && cd build + +# Debug mode +cmake .. + +# Or Release mode +cmake -DCMAKE_BUILD_TYPE=Release .. + +# Build +cmake --build . +``` + +This produces: +- `libfluss_cpp.a` (Static library) +- `fluss_cpp_example` (Example executable) +- Header files in `include/` + +## Integrating into Your Project + +**Option 1: CMake FetchContent** + +```cmake +include(FetchContent) +FetchContent_Declare( + fluss-cpp + GIT_REPOSITORY https://github.com/apache/fluss-rust.git + SOURCE_SUBDIR bindings/cpp +) +FetchContent_MakeAvailable(fluss-cpp) + +target_link_libraries(your_target PRIVATE fluss_cpp) +``` + +**Option 2: Manual Integration** + +Copy the build artifacts and configure CMake: + +```cmake +find_package(Arrow REQUIRED) + +add_library(fluss_cpp STATIC IMPORTED) +set_target_properties(fluss_cpp PROPERTIES + IMPORTED_LOCATION ${CMAKE_SOURCE_DIR}/lib/libfluss_cpp.a + INTERFACE_INCLUDE_DIRECTORIES ${CMAKE_SOURCE_DIR}/include +) + +target_link_libraries(your_target + PRIVATE + fluss_cpp + Arrow::arrow_shared + ${CMAKE_DL_LIBS} + Threads::Threads +) + +# On macOS, also link these frameworks +if(APPLE) + target_link_libraries(your_target PRIVATE + "-framework CoreFoundation" + "-framework Security" + ) +endif() +``` + +**Option 3: Subdirectory** + +```cmake +add_subdirectory(vendor/fluss-rust/bindings/cpp) +target_link_libraries(your_target PRIVATE fluss_cpp) +``` diff --git a/website/docs/user-guide/python/_category_.json b/website/docs/user-guide/python/_category_.json new file mode 100644 index 00000000..a9f34b47 --- /dev/null +++ b/website/docs/user-guide/python/_category_.json @@ -0,0 +1,4 @@ +{ + "label": "Python", + "position": 2 +} diff --git a/website/docs/user-guide/python/api-reference.md b/website/docs/user-guide/python/api-reference.md new file mode 100644 index 00000000..99437630 --- /dev/null +++ b/website/docs/user-guide/python/api-reference.md @@ -0,0 +1,281 @@ +--- +sidebar_position: 2 +--- +# API Reference + +Complete API reference for the Fluss Python client. + +## `Config` + +| Method / Property | Description | +|-----------------------------------|----------------------------------------------| +| `Config(properties: dict = None)` | Create config from a dict of key-value pairs | +| `.bootstrap_servers` | Get/set coordinator server address | +| `.writer_request_max_size` | Get/set max request size in bytes | +| `.writer_batch_size` | Get/set write batch size in bytes | + +## `FlussConnection` + +| Method | Description | +|-----------------------------------------------------------|---------------------------------------| +| `await FlussConnection.create(config) -> FlussConnection` | Connect to a Fluss cluster | +| `await conn.get_admin() -> FlussAdmin` | Get admin interface | +| `await conn.get_table(table_path) -> FlussTable` | Get a table for read/write operations | +| `conn.close()` | Close the connection | + +Supports `with` statement (context manager). 
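+
+A minimal sketch of the connection lifecycle, assuming a cluster is reachable at the default address (the database and table names are illustrative):
+
+```python
+import fluss
+
+config = fluss.Config({"bootstrap.servers": "127.0.0.1:9123"})
+
+# Used as a context manager, the connection is closed automatically on exit.
+with await fluss.FlussConnection.create(config) as conn:
+    admin = await conn.get_admin()
+    table = await conn.get_table(fluss.TablePath("fluss", "my_table"))
+    # ... admin and table operations go here
+```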
+ +## `FlussAdmin` + +| Method | Description | +|-----------------------------------------------------------------------------------------------------------------------|---------------------------------------| +| `await create_database(name, database_descriptor=None, ignore_if_exists=False)` | Create a database | +| `await drop_database(name, ignore_if_not_exists=False, cascade=True)` | Drop a database | +| `await list_databases() -> list[str]` | List all databases | +| `await database_exists(name) -> bool` | Check if a database exists | +| `await get_database_info(name) -> DatabaseInfo` | Get database metadata | +| `await create_table(table_path, table_descriptor, ignore_if_exists=False)` | Create a table | +| `await drop_table(table_path, ignore_if_not_exists=False)` | Drop a table | +| `await get_table_info(table_path) -> TableInfo` | Get table metadata | +| `await list_tables(database_name) -> list[str]` | List tables in a database | +| `await table_exists(table_path) -> bool` | Check if a table exists | +| `await list_offsets(table_path, bucket_ids, offset_type, timestamp=None) -> dict[int, int]` | Get offsets for buckets | +| `await list_partition_offsets(table_path, partition_name, bucket_ids, offset_type, timestamp=None) -> dict[int, int]` | Get offsets for a partition's buckets | +| `await create_partition(table_path, partition_spec, ignore_if_exists=False)` | Create a partition | +| `await drop_partition(table_path, partition_spec, ignore_if_not_exists=False)` | Drop a partition | +| `await list_partition_infos(table_path) -> list[PartitionInfo]` | List partitions | +| `await get_latest_lake_snapshot(table_path) -> LakeSnapshot` | Get latest lake snapshot | + +## `FlussTable` + +| Method | Description | +|---------------------------------|-----------------------------------------| +| `new_scan() -> TableScan` | Create a scan builder | +| `new_append() -> TableAppend` | Create an append builder for log tables | +| `new_upsert() -> TableUpsert` | Create an upsert builder for PK tables | +| `new_lookup() -> TableLookup` | Create a lookup builder for PK tables | +| `get_table_info() -> TableInfo` | Get table metadata | +| `get_table_path() -> TablePath` | Get table path | +| `has_primary_key() -> bool` | Check if table has a primary key | + +## `TableScan` + +| Method | Description | +|----------------------------------------------------------|---------------------------------------------------------------------| +| `.project(indices) -> TableScan` | Project columns by index | +| `.project_by_name(names) -> TableScan` | Project columns by name | +| `await .create_log_scanner() -> LogScanner` | Create record-based scanner (for `poll()`) | +| `await .create_record_batch_log_scanner() -> LogScanner` | Create batch-based scanner (for `poll_arrow()`, `to_arrow()`, etc.) | + +## `TableAppend` + +Builder for creating an `AppendWriter`. Obtain via `FlussTable.new_append()`. + +| Method | Description | +|------------------------------------|--------------------------| +| `.create_writer() -> AppendWriter` | Create the append writer | + +## `TableUpsert` + +Builder for creating an `UpsertWriter`. Obtain via `FlussTable.new_upsert()`. 
+ +| Method | Description | +|----------------------------------------------------|--------------------------------------------| +| `.partial_update_by_name(columns) -> TableUpsert` | Configure partial update by column names | +| `.partial_update_by_index(indices) -> TableUpsert` | Configure partial update by column indices | +| `.create_writer() -> UpsertWriter` | Create the upsert writer | + +## `TableLookup` + +Builder for creating a `Lookuper`. Obtain via `FlussTable.new_lookup()`. + +| Method | Description | +|----------------------------------|---------------------| +| `.create_lookuper() -> Lookuper` | Create the lookuper | + +## `AppendWriter` + +| Method | Description | +|--------------------------------------------------|-------------------------------------| +| `.append(row) -> WriteResultHandle` | Append a row (dict, list, or tuple) | +| `.write_arrow(table)` | Write a PyArrow Table | +| `.write_arrow_batch(batch) -> WriteResultHandle` | Write a PyArrow RecordBatch | +| `.write_pandas(df)` | Write a Pandas DataFrame | +| `await .flush()` | Flush all pending writes | + +## `UpsertWriter` + +| Method | Description | +|-------------------------------------|---------------------------------------| +| `.upsert(row) -> WriteResultHandle` | Upsert a row (insert or update by PK) | +| `.delete(pk) -> WriteResultHandle` | Delete a row by primary key | +| `await .flush()` | Flush all pending operations | + +## `WriteResultHandle` + +| Method | Description | +|-----------------|----------------------------------------------| +| `await .wait()` | Wait for server acknowledgment of this write | + +## `Lookuper` + +| Method | Description | +|-------------------------------------|-----------------------------| +| `await .lookup(pk) -> dict \| None` | Lookup a row by primary key | + +## `LogScanner` + +| Method | Description | +|---------------------------------------------------------------|----------------------------------------------------------------------------------| +| `.subscribe(bucket_id, start_offset)` | Subscribe to a bucket | +| `.subscribe_buckets(bucket_offsets)` | Subscribe to multiple buckets (`{bucket_id: offset}`) | +| `.subscribe_partition(partition_id, bucket_id, start_offset)` | Subscribe to a partition bucket | +| `.subscribe_partition_buckets(partition_bucket_offsets)` | Subscribe to multiple partition+bucket combos (`{(part_id, bucket_id): offset}`) | +| `.unsubscribe(bucket_id)` | Unsubscribe from a bucket (non-partitioned tables) | +| `.unsubscribe_partition(partition_id, bucket_id)` | Unsubscribe from a partition bucket | +| `.poll(timeout_ms) -> list[ScanRecord]` | Poll individual records (record scanner only) | +| `.poll_arrow(timeout_ms) -> pa.Table` | Poll as Arrow Table (batch scanner only) | +| `.poll_record_batch(timeout_ms) -> list[RecordBatch]` | Poll batches with metadata (batch scanner only) | +| `.to_arrow() -> pa.Table` | Read all subscribed data as Arrow Table (batch scanner only) | +| `.to_pandas() -> pd.DataFrame` | Read all subscribed data as DataFrame (batch scanner only) | + +## `ScanRecord` + +| Property | Description | +|------------------------------|---------------------------------------------------------------------| +| `.bucket -> TableBucket` | Bucket this record belongs to | +| `.offset -> int` | Record offset in the log | +| `.timestamp -> int` | Record timestamp | +| `.change_type -> ChangeType` | Change type (AppendOnly, Insert, UpdateBefore, UpdateAfter, Delete) | +| `.row -> dict` | Row data as `{column_name: value}` | + +## 
`RecordBatch` + +| Property | Description | +|----------------------------|------------------------------| +| `.batch -> pa.RecordBatch` | Arrow RecordBatch data | +| `.bucket -> TableBucket` | Bucket this batch belongs to | +| `.base_offset -> int` | First record offset | +| `.last_offset -> int` | Last record offset | + +## `Schema` + +| Method | Description | +|------------------------------------------------|----------------------------| +| `Schema(schema: pa.Schema, primary_keys=None)` | Create from PyArrow schema | +| `.get_column_names() -> list[str]` | Get column names | +| `.get_column_types() -> list[str]` | Get column type names | + +## `TableDescriptor` + +| Method | Description | +|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------| +| `TableDescriptor(schema, *, partition_keys=None, bucket_count=None, bucket_keys=None, comment=None, log_format=None, kv_format=None, properties=None, custom_properties=None)` | Create table descriptor | +| `.get_schema() -> Schema` | Get the schema | + +## `TablePath` + +| Method / Property | Description | +|------------------------------|---------------------| +| `TablePath(database, table)` | Create a table path | +| `.database_name -> str` | Database name | +| `.table_name -> str` | Table name | + +## `TableInfo` + +| Property / Method | Description | +|--------------------------------------|-----------------------------| +| `.table_id -> int` | Table ID | +| `.table_path -> TablePath` | Table path | +| `.num_buckets -> int` | Number of buckets | +| `.schema_id -> int` | Schema ID | +| `.comment -> str \| None` | Table comment | +| `.created_time -> int` | Creation timestamp | +| `.modified_time -> int` | Last modification timestamp | +| `.get_primary_keys() -> list[str]` | Primary key columns | +| `.get_partition_keys() -> list[str]` | Partition columns | +| `.get_bucket_keys() -> list[str]` | Bucket key columns | +| `.has_primary_key() -> bool` | Has primary key? | +| `.is_partitioned() -> bool` | Is partitioned? 
| +| `.get_schema() -> Schema` | Get table schema | +| `.get_column_names() -> list[str]` | Column names | +| `.get_column_count() -> int` | Number of columns | +| `.get_properties() -> dict` | All table properties | +| `.get_custom_properties() -> dict` | Custom properties only | + +## `PartitionInfo` + +| Property | Description | +|--------------------------|----------------| +| `.partition_id -> int` | Partition ID | +| `.partition_name -> str` | Partition name | + +## `DatabaseDescriptor` + +| Method / Property | Description | +|------------------------------------------------------------|-------------------| +| `DatabaseDescriptor(comment=None, custom_properties=None)` | Create descriptor | +| `.comment -> str \| None` | Database comment | +| `.get_custom_properties() -> dict` | Custom properties | + +## `DatabaseInfo` + +| Property / Method | Description | +|----------------------------------------------------|-----------------------------| +| `.database_name -> str` | Database name | +| `.created_time -> int` | Creation timestamp | +| `.modified_time -> int` | Last modification timestamp | +| `.get_database_descriptor() -> DatabaseDescriptor` | Get descriptor | + +## `LakeSnapshot` + +| Property / Method | Description | +|---------------------------------------------------|-------------------------| +| `.snapshot_id -> int` | Snapshot ID | +| `.table_buckets_offset -> dict[TableBucket, int]` | All bucket offsets | +| `.get_bucket_offset(bucket) -> int \| None` | Get offset for a bucket | +| `.get_table_buckets() -> list[TableBucket]` | Get all buckets | + +## `TableBucket` + +| Method / Property | Description | +|--------------------------------------------------------------|----------------------------------------| +| `TableBucket(table_id, bucket)` | Create non-partitioned bucket | +| `TableBucket.with_partition(table_id, partition_id, bucket)` | Create partitioned bucket | +| `.table_id -> int` | Table ID | +| `.bucket_id -> int` | Bucket ID | +| `.partition_id -> int \| None` | Partition ID (None if non-partitioned) | + +## `FlussError` + +| Property | Description | +|-------------------|---------------| +| `.message -> str` | Error message | + +Raised for all Fluss-specific errors (connection failures, table not found, schema mismatches, etc.). Inherits from `Exception`. 
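+
+Because `FlussError` inherits from `Exception`, catch it before any broad `except Exception` handler so Fluss-specific failures are not swallowed. A short sketch (`admin` and the table name are illustrative):
+
+```python
+try:
+    info = await admin.get_table_info(fluss.TablePath("fluss", "missing_table"))
+except fluss.FlussError as e:
+    # Fluss-specific failure, e.g. table not found
+    print(f"Fluss error: {e.message}")
+except Exception:
+    # Anything else (programming errors, cancellation, ...)
+    raise
+```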
+ +## Constants + +| Constant | Value | Description | +|------------------------------|---------------|-----------------------------------------------------| +| `fluss.EARLIEST_OFFSET` | `-2` | Start reading from earliest available offset | +| `fluss.OffsetType.EARLIEST` | `"earliest"` | For `list_offsets()` | +| `fluss.OffsetType.LATEST` | `"latest"` | For `list_offsets()` | +| `fluss.OffsetType.TIMESTAMP` | `"timestamp"` | For `list_offsets()` with timestamp | + +To start reading from the latest offset (only new records), resolve the current offset via `list_offsets` before subscribing: + +```python +offsets = await admin.list_offsets(table_path, [0], fluss.OffsetType.LATEST) +scanner.subscribe(bucket_id=0, start_offset=offsets[0]) +``` + +## `ChangeType` + +| Value | Short String | Description | +|-------------------------------|--------------|-------------------------------| +| `ChangeType.AppendOnly` (0) | `+A` | Append-only | +| `ChangeType.Insert` (1) | `+I` | Insert | +| `ChangeType.UpdateBefore` (2) | `-U` | Previous value of updated row | +| `ChangeType.UpdateAfter` (3) | `+U` | New value of updated row | +| `ChangeType.Delete` (4) | `-D` | Delete | diff --git a/website/docs/user-guide/python/data-types.md b/website/docs/user-guide/python/data-types.md new file mode 100644 index 00000000..608a49f9 --- /dev/null +++ b/website/docs/user-guide/python/data-types.md @@ -0,0 +1,21 @@ +--- +sidebar_position: 3 +--- +# Data Types + +The Python client uses PyArrow types for schema definitions: + +| PyArrow Type | Fluss Type | Python Type | +|-------------------------------------------------|-----------------------------------|---------------------| +| `pa.bool_()` | Boolean | `bool` | +| `pa.int8()` / `int16()` / `int32()` / `int64()` | TinyInt / SmallInt / Int / BigInt | `int` | +| `pa.float32()` / `float64()` | Float / Double | `float` | +| `pa.string()` | String | `str` | +| `pa.binary()` | Bytes | `bytes` | +| `pa.date32()` | Date | `datetime.date` | +| `pa.time32("ms")` | Time | `datetime.time` | +| `pa.timestamp("us")` | Timestamp (NTZ) | `datetime.datetime` | +| `pa.timestamp("us", tz="UTC")` | TimestampLTZ | `datetime.datetime` | +| `pa.decimal128(precision, scale)` | Decimal | `decimal.Decimal` | + +All Python native types (`date`, `time`, `datetime`, `Decimal`) work when appending rows via dicts. 
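+
+For example, a sketch of appending a row that mixes these types; the table and column names are illustrative, and `conn`/`admin` are assumed to be set up as in the examples:
+
+```python
+import datetime
+from decimal import Decimal
+
+import pyarrow as pa
+import fluss
+
+schema = fluss.Schema(pa.schema([
+    pa.field("id", pa.int32()),
+    pa.field("order_date", pa.date32()),
+    pa.field("created_at", pa.timestamp("us")),
+    pa.field("amount", pa.decimal128(10, 2)),
+]))
+
+table_path = fluss.TablePath("fluss", "orders")
+await admin.create_table(table_path, fluss.TableDescriptor(schema), ignore_if_exists=True)
+
+table = await conn.get_table(table_path)
+writer = table.new_append().create_writer()
+writer.append({
+    "id": 1,
+    "order_date": datetime.date(2024, 1, 15),              # Date
+    "created_at": datetime.datetime(2024, 1, 15, 12, 30),  # Timestamp (NTZ)
+    "amount": Decimal("19.99"),                            # Decimal(10, 2)
+})
+await writer.flush()
+```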
diff --git a/website/docs/user-guide/python/error-handling.md b/website/docs/user-guide/python/error-handling.md new file mode 100644 index 00000000..3f679485 --- /dev/null +++ b/website/docs/user-guide/python/error-handling.md @@ -0,0 +1,19 @@ +--- +sidebar_position: 4 +--- +# Error Handling + +The client raises `fluss.FlussError` for Fluss-specific errors: + +```python +try: + await admin.create_table(table_path, table_descriptor) +except fluss.FlussError as e: + print(f"Fluss error: {e.message}") +``` + +Common error scenarios: +- **Connection refused**: Fluss cluster is not running or wrong address in `bootstrap.servers` +- **Table not found**: table doesn't exist or wrong database/table name +- **Partition not found**: writing to a partitioned table before creating partitions +- **Schema mismatch**: row data doesn't match the table schema diff --git a/website/docs/user-guide/python/example/_category_.json b/website/docs/user-guide/python/example/_category_.json new file mode 100644 index 00000000..dd222949 --- /dev/null +++ b/website/docs/user-guide/python/example/_category_.json @@ -0,0 +1,4 @@ +{ + "label": "Example", + "position": 5 +} diff --git a/website/docs/user-guide/python/example/admin-operations.md b/website/docs/user-guide/python/example/admin-operations.md new file mode 100644 index 00000000..8c62ee78 --- /dev/null +++ b/website/docs/user-guide/python/example/admin-operations.md @@ -0,0 +1,77 @@ +--- +sidebar_position: 3 +--- +# Admin Operations + +```python +admin = await conn.get_admin() +``` + +## Databases + +```python +await admin.create_database("my_database", ignore_if_exists=True) +databases = await admin.list_databases() +exists = await admin.database_exists("my_database") +await admin.drop_database("my_database", ignore_if_not_exists=True, cascade=True) +``` + +## Tables + +Schemas are defined using PyArrow and wrapped in `fluss.Schema`: + +```python +import pyarrow as pa + +schema = fluss.Schema(pa.schema([ + pa.field("id", pa.int32()), + pa.field("name", pa.string()), + pa.field("amount", pa.int64()), +])) + +table_path = fluss.TablePath("my_database", "my_table") +await admin.create_table(table_path, fluss.TableDescriptor(schema), ignore_if_exists=True) + +table_info = await admin.get_table_info(table_path) +tables = await admin.list_tables("my_database") +await admin.drop_table(table_path, ignore_if_not_exists=True) +``` + +### TableDescriptor Options + +`TableDescriptor` accepts these optional parameters: + +| Parameter | Description | +|---------------------|-------------------------------------------------------------------------------------| +| `partition_keys` | Column names to partition by (e.g. `["region"]`) | +| `bucket_count` | Number of buckets (parallelism units) for the table | +| `bucket_keys` | Columns used to determine bucket assignment | +| `comment` | Table comment / description | +| `log_format` | Log storage format: `"ARROW"` or `"INDEXED"` | +| `kv_format` | KV storage format for primary key tables: `"INDEXED"` or `"COMPACTED"` | +| `properties` | Table configuration properties as a dict (e.g. 
`{"table.replication.factor": "1"}`) | +| `custom_properties` | User-defined properties as a dict | + +## Offsets + +```python +# Latest offsets for buckets +offsets = await admin.list_offsets(table_path, bucket_ids=[0, 1], offset_type="latest") + +# By timestamp +offsets = await admin.list_offsets(table_path, bucket_ids=[0], offset_type="timestamp", timestamp=1704067200000) + +# Per-partition offsets +offsets = await admin.list_partition_offsets(table_path, partition_name="US", bucket_ids=[0], offset_type="latest") +``` + +## Lake Snapshot + +```python +snapshot = await admin.get_latest_lake_snapshot(table_path) +print(f"Snapshot ID: {snapshot.snapshot_id}") +print(f"Table buckets: {snapshot.get_table_buckets()}") + +bucket = fluss.TableBucket(table_id=1, bucket=0) +offset = snapshot.get_bucket_offset(bucket) +``` diff --git a/website/docs/user-guide/python/example/configuration.md b/website/docs/user-guide/python/example/configuration.md new file mode 100644 index 00000000..8e88d2a6 --- /dev/null +++ b/website/docs/user-guide/python/example/configuration.md @@ -0,0 +1,34 @@ +--- +sidebar_position: 2 +--- +# Configuration + +```python +import fluss + +config = fluss.Config({"bootstrap.servers": "127.0.0.1:9123"}) +conn = await fluss.FlussConnection.create(config) +``` + +The connection also supports context managers: + +```python +with await fluss.FlussConnection.create(config) as conn: + ... +``` + +## Configuration Options + +| Key | Description | Default | +|---------------------|-------------------------------------------------------|--------------------| +| `bootstrap.servers` | Coordinator server address | `127.0.0.1:9123` | +| `request.max.size` | Maximum request size in bytes | `10485760` (10 MB) | +| `writer.acks` | Acknowledgment setting (`all` waits for all replicas) | `all` | +| `writer.retries` | Number of retries on failure | `2147483647` | +| `writer.batch.size` | Batch size for writes in bytes | `2097152` (2 MB) | + +Remember to close the connection when done: + +```python +conn.close() +``` diff --git a/website/docs/user-guide/python/example/index.md b/website/docs/user-guide/python/example/index.md new file mode 100644 index 00000000..389b6486 --- /dev/null +++ b/website/docs/user-guide/python/example/index.md @@ -0,0 +1,46 @@ +--- +sidebar_position: 1 +--- +# Example + +Minimal working example: connect to Fluss, create a table, write data, and read it back. 
+ +```python +import asyncio +import pyarrow as pa +import fluss + +async def main(): + # Connect + config = fluss.Config({"bootstrap.servers": "127.0.0.1:9123"}) + conn = await fluss.FlussConnection.create(config) + admin = await conn.get_admin() + + # Create a log table + schema = fluss.Schema(pa.schema([ + pa.field("id", pa.int32()), + pa.field("name", pa.string()), + pa.field("score", pa.float32()), + ])) + table_path = fluss.TablePath("fluss", "quick_start") + await admin.create_table(table_path, fluss.TableDescriptor(schema), ignore_if_exists=True) + + # Write + table = await conn.get_table(table_path) + writer = table.new_append().create_writer() + writer.append({"id": 1, "name": "Alice", "score": 95.5}) + writer.append({"id": 2, "name": "Bob", "score": 87.0}) + await writer.flush() + + # Read + num_buckets = (await admin.get_table_info(table_path)).num_buckets + scanner = await table.new_scan().create_record_batch_log_scanner() + scanner.subscribe_buckets({i: fluss.EARLIEST_OFFSET for i in range(num_buckets)}) + print(scanner.to_pandas()) + + # Cleanup + await admin.drop_table(table_path, ignore_if_not_exists=True) + conn.close() + +asyncio.run(main()) +``` diff --git a/website/docs/user-guide/python/example/log-tables.md b/website/docs/user-guide/python/example/log-tables.md new file mode 100644 index 00000000..63903a4e --- /dev/null +++ b/website/docs/user-guide/python/example/log-tables.md @@ -0,0 +1,122 @@ +--- +sidebar_position: 4 +--- +# Log Tables + +Log tables are append-only tables without primary keys, suitable for event streaming. + +## Creating a Log Table + +```python +import pyarrow as pa + +schema = fluss.Schema(pa.schema([ + pa.field("id", pa.int32()), + pa.field("name", pa.string()), + pa.field("score", pa.float32()), +])) + +table_path = fluss.TablePath("fluss", "events") +await admin.create_table(table_path, fluss.TableDescriptor(schema), ignore_if_exists=True) +``` + +## Writing + +Rows can be appended as dicts, lists, or tuples. For bulk writes, use `write_arrow()`, `write_arrow_batch()`, or `write_pandas()`. + +Write methods like `append()` and `write_arrow_batch()` return a `WriteResultHandle`. You can ignore it for fire-and-forget semantics (flush at the end), or `await handle.wait()` to block until the server acknowledges that specific write. + +```python +table = await conn.get_table(table_path) +writer = table.new_append().create_writer() + +# Fire-and-forget: queue writes, flush at the end +writer.append({"id": 1, "name": "Alice", "score": 95.5}) +writer.append([2, "Bob", 87.0]) +await writer.flush() + +# Per-record acknowledgment +handle = writer.append({"id": 3, "name": "Charlie", "score": 91.0}) +await handle.wait() + +# Bulk writes +writer.write_arrow(pa_table) # PyArrow Table +writer.write_arrow_batch(record_batch) # PyArrow RecordBatch +writer.write_pandas(df) # Pandas DataFrame +await writer.flush() +``` + +## Reading + +There are two scanner types: +- **Batch scanner** (`create_record_batch_log_scanner()`): returns Arrow Tables or DataFrames, best for analytics +- **Record scanner** (`create_log_scanner()`): returns individual records with metadata (offset, timestamp, change type), best for streaming + +And two reading modes: +- **`to_arrow()` / `to_pandas()`**: reads all data from subscribed buckets up to the current latest offset, then returns. Best for one-shot batch reads. +- **`poll_arrow()` / `poll()` / `poll_record_batch()`**: returns whatever data is available within the timeout, then returns. Call in a loop for continuous streaming. 
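+
+The batch scanner's third method, `poll_record_batch()`, additionally exposes per-batch metadata (bucket and offset range). A short sketch, reusing the `table` handle from above:
+
+```python
+scanner = await table.new_scan().create_record_batch_log_scanner()
+scanner.subscribe(bucket_id=0, start_offset=fluss.EARLIEST_OFFSET)
+
+for rb in scanner.poll_record_batch(timeout_ms=5000):
+    # rb.batch is a pyarrow.RecordBatch; bucket/base_offset/last_offset locate it in the log
+    print(f"bucket={rb.bucket.bucket_id}, offsets=[{rb.base_offset}, {rb.last_offset}], rows={rb.batch.num_rows}")
+```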
+ +### Batch Read (One-Shot) + +```python +num_buckets = (await admin.get_table_info(table_path)).num_buckets + +scanner = await table.new_scan().create_record_batch_log_scanner() +scanner.subscribe_buckets({i: fluss.EARLIEST_OFFSET for i in range(num_buckets)}) + +# Reads everything up to current latest offset, then returns +arrow_table = scanner.to_arrow() +df = scanner.to_pandas() +``` + +### Continuous Polling + +Use `poll_arrow()` or `poll()` in a loop for streaming consumption: + +```python +# Batch scanner: poll as Arrow Tables +scanner = await table.new_scan().create_record_batch_log_scanner() +scanner.subscribe(bucket_id=0, start_offset=fluss.EARLIEST_OFFSET) + +while True: + result = scanner.poll_arrow(timeout_ms=5000) + if result.num_rows > 0: + print(result.to_pandas()) + +# Record scanner: poll individual records with metadata +scanner = await table.new_scan().create_log_scanner() +scanner.subscribe_buckets({i: fluss.EARLIEST_OFFSET for i in range(num_buckets)}) + +while True: + for record in scanner.poll(timeout_ms=5000): + print(f"offset={record.offset}, change={record.change_type.short_string()}, row={record.row}") +``` + +### Unsubscribing + +To stop consuming from a bucket, use `unsubscribe()`: + +```python +scanner.unsubscribe(bucket_id=0) +``` + +### Subscribe from Latest Offset + +To only consume new records (skip existing data), first resolve the current latest offset via `list_offsets`, then subscribe at that offset: + +```python +admin = await conn.get_admin() +offsets = await admin.list_offsets(table_path, [0], fluss.OffsetType.LATEST) +latest = offsets[0] + +scanner = await table.new_scan().create_record_batch_log_scanner() +scanner.subscribe(bucket_id=0, start_offset=latest) +``` + +## Column Projection + +```python +scanner = await table.new_scan().project([0, 2]).create_record_batch_log_scanner() +# or by name +scanner = await table.new_scan().project_by_name(["id", "score"]).create_record_batch_log_scanner() +``` diff --git a/website/docs/user-guide/python/example/partitioned-tables.md b/website/docs/user-guide/python/example/partitioned-tables.md new file mode 100644 index 00000000..f8280920 --- /dev/null +++ b/website/docs/user-guide/python/example/partitioned-tables.md @@ -0,0 +1,104 @@ +--- +sidebar_position: 6 +--- +# Partitioned Tables + +Partitioned tables distribute data across partitions based on column values. Partitions must exist before writing data, otherwise the client will by default retry indefinitely. + +## Creating and Managing Partitions + +```python +import pyarrow as pa + +schema = fluss.Schema(pa.schema([ + pa.field("id", pa.int32()), + pa.field("region", pa.string()), + pa.field("value", pa.int64()), +])) + +table_path = fluss.TablePath("fluss", "partitioned_events") +await admin.create_table( + table_path, + fluss.TableDescriptor(schema, partition_keys=["region"], bucket_count=1), + ignore_if_exists=True, +) + +# Create partitions +await admin.create_partition(table_path, {"region": "US"}, ignore_if_exists=True) +await admin.create_partition(table_path, {"region": "EU"}, ignore_if_exists=True) + +# List partitions +partition_infos = await admin.list_partition_infos(table_path) +``` + +## Writing + +Same as non-partitioned tables - include partition column values in each row. 
**Partitions must exist before writing data, otherwise the client will by default retry indefinitely.** + +```python +table = await conn.get_table(table_path) +writer = table.new_append().create_writer() +writer.append({"id": 1, "region": "US", "value": 100}) +writer.append({"id": 2, "region": "EU", "value": 200}) +await writer.flush() +``` + +## Reading + +Use `subscribe_partition()` or `subscribe_partition_buckets()` instead of `subscribe()`: + +```python +scanner = await table.new_scan().create_record_batch_log_scanner() + +# Subscribe to individual partitions +for p in partition_infos: + scanner.subscribe_partition(partition_id=p.partition_id, bucket_id=0, start_offset=fluss.EARLIEST_OFFSET) + +# Or batch-subscribe +scanner.subscribe_partition_buckets({ + (p.partition_id, 0): fluss.EARLIEST_OFFSET for p in partition_infos +}) + +print(scanner.to_pandas()) +``` + +### Unsubscribing + +To stop consuming from a specific partition bucket, use `unsubscribe_partition()`: + +```python +scanner.unsubscribe_partition(partition_id=partition_infos[0].partition_id, bucket_id=0) +``` + +## Partitioned Primary Key Tables + +Partition columns must be part of the primary key. Partitions must exist before upserting data, otherwise the client will by default retry indefinitely. + +```python +schema = fluss.Schema( + pa.schema([ + pa.field("user_id", pa.int32()), + pa.field("region", pa.string()), + pa.field("score", pa.int64()), + ]), + primary_keys=["user_id", "region"], +) + +table_path = fluss.TablePath("fluss", "partitioned_users") +await admin.create_table( + table_path, + fluss.TableDescriptor(schema, partition_keys=["region"]), + ignore_if_exists=True, +) + +await admin.create_partition(table_path, {"region": "US"}, ignore_if_exists=True) + +table = await conn.get_table(table_path) +writer = table.new_upsert().create_writer() +writer.upsert({"user_id": 1, "region": "US", "score": 1234}) +await writer.flush() + +# Lookup includes partition columns +lookuper = table.new_lookup().create_lookuper() +result = await lookuper.lookup({"user_id": 1, "region": "US"}) +``` diff --git a/website/docs/user-guide/python/example/primary-key-tables.md b/website/docs/user-guide/python/example/primary-key-tables.md new file mode 100644 index 00000000..cd61e508 --- /dev/null +++ b/website/docs/user-guide/python/example/primary-key-tables.md @@ -0,0 +1,61 @@ +--- +sidebar_position: 5 +--- +# Primary Key Tables + +Primary key tables support upsert, delete, and point lookup operations. 
+ +## Creating a Primary Key Table + +Pass `primary_keys` to `fluss.Schema`: + +```python +import pyarrow as pa + +schema = fluss.Schema( + pa.schema([ + pa.field("id", pa.int32()), + pa.field("name", pa.string()), + pa.field("age", pa.int64()), + ]), + primary_keys=["id"], +) +table_path = fluss.TablePath("fluss", "users") +await admin.create_table(table_path, fluss.TableDescriptor(schema, bucket_count=3), ignore_if_exists=True) +``` + +## Upsert, Delete, Lookup + +```python +table = await conn.get_table(table_path) + +# Upsert (fire-and-forget, flush at the end) +writer = table.new_upsert().create_writer() +writer.upsert({"id": 1, "name": "Alice", "age": 25}) +writer.upsert({"id": 2, "name": "Bob", "age": 30}) +await writer.flush() + +# Per-record acknowledgment (for read-after-write) +handle = writer.upsert({"id": 3, "name": "Charlie", "age": 35}) +await handle.wait() + +# Delete by primary key +handle = writer.delete({"id": 2}) +await handle.wait() + +# Lookup +lookuper = table.new_lookup().create_lookuper() +result = await lookuper.lookup({"id": 1}) +if result: + print(f"Found: name={result['name']}, age={result['age']}") +``` + +## Partial Updates + +Update specific columns while preserving others: + +```python +partial_writer = table.new_upsert().partial_update_by_name(["id", "age"]).create_writer() +partial_writer.upsert({"id": 1, "age": 27}) # only updates age +await partial_writer.flush() +``` diff --git a/website/docs/user-guide/python/installation.md b/website/docs/user-guide/python/installation.md new file mode 100644 index 00000000..d5918aea --- /dev/null +++ b/website/docs/user-guide/python/installation.md @@ -0,0 +1,41 @@ +--- +sidebar_position: 1 +--- +# Installation + +```bash +pip install pyfluss +``` + +To build from source instead: + +**Prerequisites:** Python 3.9+, Rust 1.85+ + +```bash +git clone https://github.com/apache/fluss-rust.git +cd fluss-rust/bindings/python +``` + +Install [maturin](https://github.com/PyO3/maturin): + +```bash +pip install maturin +``` + +Build and install: + +```bash +# Development mode (editable) +maturin develop + +# Or build a wheel +maturin build --release +pip install target/wheels/fluss-*.whl +``` + +Verify: + +```python +import fluss +print("Fluss Python bindings installed successfully!") +``` diff --git a/website/docs/user-guide/rust/_category_.json b/website/docs/user-guide/rust/_category_.json new file mode 100644 index 00000000..cdec432d --- /dev/null +++ b/website/docs/user-guide/rust/_category_.json @@ -0,0 +1,4 @@ +{ + "label": "Rust", + "position": 1 +} diff --git a/website/docs/user-guide/rust/api-reference.md b/website/docs/user-guide/rust/api-reference.md new file mode 100644 index 00000000..4f694444 --- /dev/null +++ b/website/docs/user-guide/rust/api-reference.md @@ -0,0 +1,441 @@ +--- +sidebar_position: 2 +--- +# API Reference + +Complete API reference for the Fluss Rust client. 
+
+## `Config`
+
+| Field | Type | Default | Description |
+|---|---|---|---|
+| `bootstrap_servers` | `String` | `"127.0.0.1:9123"` | Coordinator server address |
+| `writer_request_max_size` | `i32` | `10485760` (10 MB) | Maximum request size in bytes |
+| `writer_acks` | `String` | `"all"` | Acknowledgment setting (`"all"` waits for all replicas) |
+| `writer_retries` | `i32` | `i32::MAX` | Number of retries on failure |
+| `writer_batch_size` | `i32` | `2097152` (2 MB) | Batch size for writes in bytes |
+| `scanner_remote_log_prefetch_num` | `usize` | `4` | Number of remote log segments to prefetch |
+| `remote_file_download_thread_num` | `usize` | `3` | Number of threads for remote log downloads |
+
+## `FlussConnection`
+
+| Method | Description |
+|---|---|
+| `async fn new(config: Config) -> Result<FlussConnection>` | Create a new connection to a Fluss cluster |
+| `async fn get_admin(&self) -> Result<FlussAdmin>` | Get the admin interface for cluster management |
+| `async fn get_table(&self, table_path: &TablePath) -> Result<FlussTable<'a>>` | Get a table for read/write operations |
+| `fn config(&self) -> &Config` | Get a reference to the connection config |
+
+## `FlussAdmin`
+
+### Database Operations
+
+| Method | Description |
+|---|---|
+| `async fn create_database(&self, name: &str, descriptor: Option<&DatabaseDescriptor>, ignore_if_exists: bool) -> Result<()>` | Create a database |
+| `async fn drop_database(&self, name: &str, ignore_if_not_exists: bool, cascade: bool) -> Result<()>` | Drop a database |
+| `async fn list_databases(&self) -> Result<Vec<String>>` | List all databases |
+| `async fn database_exists(&self, name: &str) -> Result<bool>` | Check if a database exists |
+| `async fn get_database_info(&self, name: &str) -> Result<DatabaseInfo>` | Get database metadata |
+
+### Table Operations
+
+| Method | Description |
+|---|---|
+| `async fn create_table(&self, table_path: &TablePath, descriptor: &TableDescriptor, ignore_if_exists: bool) -> Result<()>` | Create a table |
+| `async fn drop_table(&self, table_path: &TablePath, ignore_if_not_exists: bool) -> Result<()>` | Drop a table |
+| `async fn get_table_info(&self, table_path: &TablePath) -> Result<TableInfo>` | Get table metadata |
+| `async fn list_tables(&self, database_name: &str) -> Result<Vec<String>>` | List tables in a database |
+| `async fn table_exists(&self, table_path: &TablePath) -> Result<bool>` | Check if a table exists |
+
+### Partition Operations
+
+| Method | Description |
+|---|---|
+| `async fn list_partition_infos(&self, table_path: &TablePath) -> Result<Vec<PartitionInfo>>` | List all partitions |
+| `async fn create_partition(&self, table_path: &TablePath, spec: &PartitionSpec, ignore_if_exists: bool) -> Result<()>` | Create a partition |
+| `async fn drop_partition(&self, table_path: &TablePath, spec: &PartitionSpec, ignore_if_not_exists: bool) -> Result<()>` | Drop a partition |
+
+### Offset Operations
+
+| Method | Description |
+|------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------| +| `async fn list_offsets(&self, table_path: &TablePath, bucket_ids: &[i32], offset_spec: OffsetSpec) -> Result>` | Get offsets for buckets | +| `async fn list_partition_offsets(&self, table_path: &TablePath, partition_name: &str, bucket_ids: &[i32], offset_spec: OffsetSpec) -> Result>` | Get offsets for a partition's buckets | + +### Lake Operations + +| Method | Description | +|--------------------------------------------------------------------------------------------|------------------------------| +| `async fn get_latest_lake_snapshot(&self, table_path: &TablePath) -> Result` | Get the latest lake snapshot | + +## `FlussTable<'a>` + +| Method | Description | +|-----------------------------------------------|-----------------------------------------| +| `fn get_table_info(&self) -> &TableInfo` | Get table metadata | +| `fn new_append(&self) -> Result` | Create an append builder for log tables | +| `fn new_scan(&self) -> TableScan<'_>` | Create a scan builder | +| `fn new_lookup(&self) -> Result` | Create a lookup builder for PK tables | +| `fn new_upsert(&self) -> Result` | Create an upsert builder for PK tables | +| `fn has_primary_key(&self) -> bool` | Check if the table has a primary key | +| `fn table_path(&self) -> &TablePath` | Get the table path | + +## `TableAppend` + +| Method | Description | +|---------------------------------------------------|-------------------------| +| `fn create_writer(&self) -> Result` | Create an append writer | + +## `AppendWriter` + +| Method | Description | +|---------------------------------------------------------------------------------|---------------------------------------------------| +| `fn append(&self, row: &impl InternalRow) -> Result` | Append a row; returns a future for acknowledgment | +| `fn append_arrow_batch(&self, batch: RecordBatch) -> Result` | Append an Arrow RecordBatch | +| `async fn flush(&self) -> Result<()>` | Flush all pending writes to the server | + +## `TableScan<'a>` + +| Method | Description | +|-----------------------------------------------------------------------------|-----------------------------------------| +| `fn project(self, indices: &[usize]) -> Result` | Project columns by index | +| `fn project_by_name(self, names: &[&str]) -> Result` | Project columns by name | +| `fn create_log_scanner(self) -> Result` | Create a record-based log scanner | +| `fn create_record_batch_log_scanner(self) -> Result` | Create an Arrow batch-based log scanner | + +## `LogScanner` + +| Method | Description | +|-----------------------------------------------------------------------------------------------------------|----------------------------------------------------------| +| `async fn subscribe(&self, bucket_id: i32, start_offset: i64) -> Result<()>` | Subscribe to a bucket | +| `async fn subscribe_buckets(&self, bucket_offsets: &HashMap) -> Result<()>` | Subscribe to multiple buckets | +| `async fn subscribe_partition(&self, partition_id: i64, bucket_id: i32, start_offset: i64) -> Result<()>` | Subscribe to a partition bucket | +| `async fn subscribe_partition_buckets(&self, offsets: &HashMap<(i64, i32), i64>) -> Result<()>` | Subscribe to multiple partition-bucket pairs | +| `async fn unsubscribe(&self, bucket_id: i32) -> Result<()>` | Unsubscribe from a bucket (non-partitioned tables) | +| `async fn 
unsubscribe_partition(&self, partition_id: i64, bucket_id: i32) -> Result<()>` | Unsubscribe from a partition bucket (partitioned tables) | +| `async fn poll(&self, timeout: Duration) -> Result` | Poll for records | + +## `RecordBatchLogScanner` + +| Method | Description | +|-----------------------------------------------------------------------------------------------------------|----------------------------------------------------------| +| `async fn subscribe(&self, bucket_id: i32, start_offset: i64) -> Result<()>` | Subscribe to a bucket | +| `async fn subscribe_buckets(&self, bucket_offsets: &HashMap) -> Result<()>` | Subscribe to multiple buckets | +| `async fn subscribe_partition(&self, partition_id: i64, bucket_id: i32, start_offset: i64) -> Result<()>` | Subscribe to a partition bucket | +| `async fn subscribe_partition_buckets(&self, offsets: &HashMap<(i64, i32), i64>) -> Result<()>` | Subscribe to multiple partition-bucket pairs | +| `async fn unsubscribe(&self, bucket_id: i32) -> Result<()>` | Unsubscribe from a bucket (non-partitioned tables) | +| `async fn unsubscribe_partition(&self, partition_id: i64, bucket_id: i32) -> Result<()>` | Unsubscribe from a partition bucket (partitioned tables) | +| `async fn poll(&self, timeout: Duration) -> Result>` | Poll for Arrow record batches | + +## `ScanRecord` + +| Method | Description | +|----------------------------------------|----------------------------------------| +| `fn row(&self) -> &dyn InternalRow` | Get the row data | +| `fn offset(&self) -> i64` | Record offset in the log | +| `fn timestamp(&self) -> i64` | Record timestamp | +| `fn change_type(&self) -> &ChangeType` | Change type (AppendOnly, Insert, etc.) | + +## `ScanRecords` + +| Method | Description | +|--------------------------------------------------------------------------|-----------------------------------| +| `fn count(&self) -> usize` | Number of records | +| `fn is_empty(&self) -> bool` | Whether the result set is empty | +| `fn records(&self, bucket: &TableBucket) -> &[ScanRecord]` | Get records for a specific bucket | +| `fn records_by_buckets(&self) -> &HashMap>` | Get all records grouped by bucket | + +`ScanRecords` also implements `IntoIterator`, so you can iterate over all records directly: + +```rust +for record in records { + println!("offset={}", record.offset()); +} +``` + +## `ScanBatch` + +| Method | Description | +|------------------------------------|--------------------------------| +| `fn bucket(&self) -> &TableBucket` | Bucket this batch belongs to | +| `fn batch(&self) -> &RecordBatch` | Arrow RecordBatch data | +| `fn base_offset(&self) -> i64` | First record offset | +| `fn last_offset(&self) -> i64` | Last record offset | +| `fn num_records(&self) -> usize` | Number of records in the batch | + +## `TableUpsert` + +| Method | Description | +|---------------------------------------------------------------------------------------|---------------------------------------------------| +| `fn create_writer(&self) -> Result` | Create an upsert writer | +| `fn partial_update(&self, column_indices: Option>) -> Result` | Create a partial update builder by column indices | +| `fn partial_update_with_column_names(&self, names: &[&str]) -> Result` | Create a partial update builder by column names | + +## `UpsertWriter` + +| Method | Description | +|-------------------------------------------------------------------------|---------------------------------------| +| `fn upsert(&self, row: &impl InternalRow) -> Result` | Upsert a row (insert or update by 
PK) | +| `fn delete(&self, row: &impl InternalRow) -> Result` | Delete a row by primary key | +| `async fn flush(&self) -> Result<()>` | Flush all pending operations | + +## `TableLookup` + +| Method | Description | +|-------------------------------------------------|-------------------------------------| +| `fn create_lookuper(&self) -> Result` | Create a lookuper for point lookups | + +## `Lookuper` + +| Method | Description | +|------------------------------------------------------------------------------|-----------------------------| +| `async fn lookup(&mut self, key: &impl InternalRow) -> Result` | Lookup a row by primary key | + +## `LookupResult` + +| Method | Description | +|----------------------------------------------------------------|----------------------------------| +| `fn get_single_row(&self) -> Result>` | Get a single row from the result | +| `fn get_rows(&self) -> Vec` | Get all rows from the result | + +## `WriteResultFuture` + +| Description | +|-----------------------------------------------------------------------------------------------------------------------------------------------| +| Implements `Future>`. Await to wait for server acknowledgment. Returned by `append()`, `upsert()`, and `delete()`. | + +Usage: + +```rust +// Fire-and-forget (batched) +writer.append(&row)?; +writer.flush().await?; + +// Per-record acknowledgment +writer.append(&row)?.await?; +``` + +## `Schema` + +| Method | Description | +|------------------------------------------------|------------------------------------------| +| `fn builder() -> SchemaBuilder` | Create a schema builder | +| `fn columns(&self) -> &[Column]` | Get all columns | +| `fn primary_key(&self) -> Option<&PrimaryKey>` | Get primary key (None if no primary key) | +| `fn column_names(&self) -> Vec<&str>` | Get all column names | +| `fn primary_key_indexes(&self) -> Vec` | Get primary key column indices | + +## `SchemaBuilder` + +| Method | Description | +|------------------------------------------------------|-------------------------| +| `fn column(name: &str, data_type: DataType) -> Self` | Add a column | +| `fn primary_key(keys: Vec<&str>) -> Self` | Set primary key columns | +| `fn build() -> Result` | Build the schema | + +## `TableDescriptor` + +| Method | Description | +|----------------------------------------------------|--------------------------------------| +| `fn builder() -> TableDescriptorBuilder` | Create a table descriptor builder | +| `fn schema(&self) -> &Schema` | Get the table schema | +| `fn partition_keys(&self) -> &[String]` | Get partition key column names | +| `fn has_primary_key(&self) -> bool` | Check if the table has a primary key | +| `fn properties(&self) -> &HashMap` | Get all table properties | +| `fn comment(&self) -> Option<&str>` | Get table comment | + +## `TableDescriptorBuilder` + +| Method | Description | +|----------------------------------------------------------------------------------|---------------------------------------------| +| `fn schema(schema: Schema) -> Self` | Set the schema | +| `fn log_format(format: LogFormat) -> Self` | Set log format (e.g., `LogFormat::ARROW`) | +| `fn kv_format(format: KvFormat) -> Self` | Set KV format (e.g., `KvFormat::COMPACTED`) | +| `fn property(key: &str, value: &str) -> Self` | Set a table property | +| `fn partitioned_by(keys: Vec<&str>) -> Self` | Set partition columns | +| `fn distributed_by(bucket_count: Option, bucket_keys: Vec) -> Self` | Set bucket distribution | +| `fn comment(comment: &str) -> Self` | Set table comment | +| 
`fn build() -> Result` | Build the table descriptor | + +## `TablePath` + +| Method | Description | +|-------------------------------------------------------|---------------------| +| `TablePath::new(database: &str, table: &str) -> Self` | Create a table path | +| `fn database(&self) -> &str` | Get database name | +| `fn table(&self) -> &str` | Get table name | + +## `TableInfo` + +| Field / Method | Description | +|----------------------|-----------------------------------------------------| +| `.table_path` | `TablePath` -- Table path | +| `.table_id` | `i64` -- Table ID | +| `.schema_id` | `i32` -- Schema ID | +| `.schema` | `Schema` -- Table schema | +| `.primary_keys` | `Vec` -- Primary key column names | +| `.partition_keys` | `Vec` -- Partition key column names | +| `.num_buckets` | `i32` -- Number of buckets | +| `.properties` | `HashMap` -- All table properties | +| `.custom_properties` | `HashMap` -- Custom properties only | +| `.comment` | `Option` -- Table comment | +| `.created_time` | `i64` -- Creation timestamp | +| `.modified_time` | `i64` -- Last modification timestamp | + +## `TableBucket` + +| Method | Description | +|-----------------------------------------------------------------------------------------------------|--------------------------------------------| +| `TableBucket::new(table_id: i64, bucket_id: i32) -> Self` | Create a non-partitioned bucket | +| `TableBucket::new_with_partition(table_id: i64, partition_id: Option, bucket_id: i32) -> Self` | Create a partitioned bucket | +| `fn table_id(&self) -> i64` | Get table ID | +| `fn partition_id(&self) -> Option` | Get partition ID (None if non-partitioned) | +| `fn bucket_id(&self) -> i32` | Get bucket ID | + +## `PartitionSpec` + +| Method | Description | +|-------------------------------------------------------------|-------------------------------------------------------| +| `PartitionSpec::new(spec_map: HashMap<&str, &str>) -> Self` | Create from a map of partition column names to values | +| `fn get_spec_map(&self) -> &HashMap` | Get the partition spec map | + +## `PartitionInfo` + +| Method | Description | +|------------------------------------------|--------------------| +| `fn get_partition_id(&self) -> i64` | Get partition ID | +| `fn get_partition_name(&self) -> String` | Get partition name | + +## `DatabaseDescriptor` + +| Method | Description | +|-----------------------------------------------------------|--------------------------------------| +| `fn builder() -> DatabaseDescriptorBuilder` | Create a database descriptor builder | +| `fn comment(&self) -> Option<&str>` | Get database comment | +| `fn custom_properties(&self) -> &HashMap` | Get custom properties | + +## `DatabaseDescriptorBuilder` + +| Method | Description | +|-------------------------------------------------------------------------------------------|-------------------------------| +| `fn comment(comment: impl Into) -> Self` | Set database comment | +| `fn custom_properties(properties: HashMap, impl Into>) -> Self` | Set custom properties | +| `fn custom_property(key: impl Into, value: impl Into) -> Self` | Set a single custom property | +| `fn build() -> DatabaseDescriptor` | Build the database descriptor | + +## `DatabaseInfo` + +| Method | Description | +|--------------------------------------------------------|---------------------------------| +| `fn database_name(&self) -> &str` | Get database name | +| `fn created_time(&self) -> i64` | Get creation timestamp | +| `fn modified_time(&self) -> i64` | Get last modification 
timestamp | +| `fn database_descriptor(&self) -> &DatabaseDescriptor` | Get the database descriptor | + +## `LakeSnapshot` + +| Field | Description | +|-------------------------|---------------------------------------------------| +| `.snapshot_id` | `i64` -- Snapshot ID | +| `.table_buckets_offset` | `HashMap` -- All bucket offsets | + +## `GenericRow<'a>` + +| Method | Description | +|--------------------------------------------------------------------|--------------------------------------------------| +| `GenericRow::new(field_count: usize) -> Self` | Create a new row with the given number of fields | +| `fn set_field(&mut self, pos: usize, value: impl Into>)` | Set a field value by position | +| `GenericRow::from_data(data: Vec>>) -> Self` | Create a row from existing field data | + +Implements the `InternalRow` trait (see below). + +## `InternalRow` trait + +| Method | Description | +|--------------------------------------------------------------------------------|-----------------------------------------| +| `fn get_boolean(&self, idx: usize) -> bool` | Get boolean value | +| `fn get_byte(&self, idx: usize) -> i8` | Get tinyint value | +| `fn get_short(&self, idx: usize) -> i16` | Get smallint value | +| `fn get_int(&self, idx: usize) -> i32` | Get int value | +| `fn get_long(&self, idx: usize) -> i64` | Get bigint value | +| `fn get_float(&self, idx: usize) -> f32` | Get float value | +| `fn get_double(&self, idx: usize) -> f64` | Get double value | +| `fn get_string(&self, idx: usize) -> &str` | Get string value | +| `fn get_decimal(&self, idx: usize, precision: usize, scale: usize) -> Decimal` | Get decimal value | +| `fn get_date(&self, idx: usize) -> Date` | Get date value | +| `fn get_time(&self, idx: usize) -> Time` | Get time value | +| `fn get_timestamp_ntz(&self, idx: usize, precision: u32) -> TimestampNtz` | Get timestamp value | +| `fn get_timestamp_ltz(&self, idx: usize, precision: u32) -> TimestampLtz` | Get timestamp with local timezone value | +| `fn get_bytes(&self, idx: usize) -> &[u8]` | Get bytes value | +| `fn get_binary(&self, idx: usize, length: usize) -> &[u8]` | Get fixed-length binary value | +| `fn get_char(&self, idx: usize, length: usize) -> &str` | Get fixed-length char value | + +## `ChangeType` + +| Value | Short String | Description | +|----------------------------|---------------|----------------------------------| +| `ChangeType::AppendOnly` | `+A` | Append-only record | +| `ChangeType::Insert` | `+I` | Inserted row | +| `ChangeType::UpdateBefore` | `-U` | Previous value of an updated row | +| `ChangeType::UpdateAfter` | `+U` | New value of an updated row | +| `ChangeType::Delete` | `-D` | Deleted row | + +| Method | Description | +|----------------------------------|-------------------------------------| +| `fn short_string(&self) -> &str` | Get the short string representation | + +## `OffsetSpec` + +| Variant | Description | +|------------------------------|-------------------------------------------------| +| `OffsetSpec::Earliest` | Start from the earliest available offset | +| `OffsetSpec::Latest` | Start from the latest offset (only new records) | +| `OffsetSpec::Timestamp(i64)` | Start from a specific timestamp in milliseconds | + +## Constants + +| Constant | Value | Description | +|----------------------------------|--------|---------------------------------------------------------| +| `fluss::client::EARLIEST_OFFSET` | `-2` | Start reading from the earliest available offset | + +To start reading from the latest offset (only new records), 
resolve the current offset via `list_offsets` before subscribing: + +```rust +use fluss::rpc::message::OffsetSpec; + +let offsets = admin.list_offsets(&table_path, &[0], OffsetSpec::Latest).await?; +let latest = offsets[&0]; +log_scanner.subscribe(0, latest).await?; +``` + +## `DataTypes` factory + +| Method | Returns | Description | +|--------------------------------------------------|------------|------------------------------------| +| `DataTypes::boolean()` | `DataType` | Boolean type | +| `DataTypes::tinyint()` | `DataType` | 8-bit signed integer | +| `DataTypes::smallint()` | `DataType` | 16-bit signed integer | +| `DataTypes::int()` | `DataType` | 32-bit signed integer | +| `DataTypes::bigint()` | `DataType` | 64-bit signed integer | +| `DataTypes::float()` | `DataType` | 32-bit floating point | +| `DataTypes::double()` | `DataType` | 64-bit floating point | +| `DataTypes::string()` | `DataType` | Variable-length string | +| `DataTypes::bytes()` | `DataType` | Variable-length byte array | +| `DataTypes::date()` | `DataType` | Date (days since epoch) | +| `DataTypes::time()` | `DataType` | Time (milliseconds since midnight) | +| `DataTypes::timestamp()` | `DataType` | Timestamp without timezone | +| `DataTypes::timestamp_ltz()` | `DataType` | Timestamp with local timezone | +| `DataTypes::decimal(precision: u32, scale: u32)` | `DataType` | Fixed-point decimal | +| `DataTypes::char(length: u32)` | `DataType` | Fixed-length string | +| `DataTypes::binary(length: usize)` | `DataType` | Fixed-length byte array | +| `DataTypes::array(element: DataType)` | `DataType` | Array of elements | +| `DataTypes::map(key: DataType, value: DataType)` | `DataType` | Map of key-value pairs | +| `DataTypes::row(fields: Vec)` | `DataType` | Nested row type | + +## `DataField` + +| Method | Description | +|----------------------------------------------------------------------------------------------------------|---------------------| +| `DataField::new(name: impl Into, data_type: DataType, description: Option) -> DataField` | Create a data field | +| `fn name(&self) -> &str` | Get the field name | diff --git a/website/docs/user-guide/rust/data-types.md b/website/docs/user-guide/rust/data-types.md new file mode 100644 index 00000000..f5b55345 --- /dev/null +++ b/website/docs/user-guide/rust/data-types.md @@ -0,0 +1,57 @@ +--- +sidebar_position: 3 +--- +# Data Types + +| Fluss Type | Rust Type | Getter | Setter | +|-----------------|----------------|--------------------------------------|--------------------------------| +| `BOOLEAN` | `bool` | `get_boolean()` | `set_field(idx, bool)` | +| `TINYINT` | `i8` | `get_byte()` | `set_field(idx, i8)` | +| `SMALLINT` | `i16` | `get_short()` | `set_field(idx, i16)` | +| `INT` | `i32` | `get_int()` | `set_field(idx, i32)` | +| `BIGINT` | `i64` | `get_long()` | `set_field(idx, i64)` | +| `FLOAT` | `f32` | `get_float()` | `set_field(idx, f32)` | +| `DOUBLE` | `f64` | `get_double()` | `set_field(idx, f64)` | +| `CHAR` | `&str` | `get_char(idx, length)` | `set_field(idx, &str)` | +| `STRING` | `&str` | `get_string()` | `set_field(idx, &str)` | +| `DECIMAL` | `Decimal` | `get_decimal(idx, precision, scale)` | `set_field(idx, Decimal)` | +| `DATE` | `Date` | `get_date()` | `set_field(idx, Date)` | +| `TIME` | `Time` | `get_time()` | `set_field(idx, Time)` | +| `TIMESTAMP` | `TimestampNtz` | `get_timestamp_ntz(idx, precision)` | `set_field(idx, TimestampNtz)` | +| `TIMESTAMP_LTZ` | `TimestampLtz` | `get_timestamp_ltz(idx, precision)` | `set_field(idx, TimestampLtz)` | +| 
`BYTES` | `&[u8]` | `get_bytes()` | `set_field(idx, &[u8])` |
+| `BINARY(n)` | `&[u8]` | `get_binary(idx, length)` | `set_field(idx, &[u8])` |
+
+## Constructing Special Types
+
+Primitive types (`bool`, `i8`, `i16`, `i32`, `i64`, `f32`, `f64`, `&str`, `&[u8]`) can be passed directly to `set_field`. The following types require explicit construction:
+
+```rust
+use fluss::row::{Date, Time, TimestampNtz, TimestampLtz, Decimal};
+
+// Date: days since Unix epoch
+let date = Date::new(19738);
+
+// Time: milliseconds since midnight
+let time = Time::new(43200000);
+
+// Timestamp without timezone: milliseconds since epoch
+let ts = TimestampNtz::new(1704067200000);
+
+// Timestamp with local timezone: milliseconds since epoch
+let ts_ltz = TimestampLtz::new(1704067200000);
+
+// Decimal: from an unscaled long value with precision and scale
+let decimal = Decimal::from_unscaled_long(12345, 10, 2)?; // represents 123.45
+```
+
+## Creating Rows from Data
+
+`GenericRow::from_data` accepts a `Vec<Datum>`. Because multiple types implement `From<&str>`, Rust cannot infer the target type from `.into()` alone. Annotate the vector type explicitly:
+
+```rust
+use fluss::row::{Datum, GenericRow};
+
+let data: Vec<Datum> = vec![1i32.into(), "hello".into()];
+let row = GenericRow::from_data(data);
+```
diff --git a/website/docs/user-guide/rust/error-handling.md b/website/docs/user-guide/rust/error-handling.md
new file mode 100644
index 00000000..35ede6c8
--- /dev/null
+++ b/website/docs/user-guide/rust/error-handling.md
@@ -0,0 +1,180 @@
+---
+sidebar_position: 4
+---
+# Error Handling
+
+The Fluss Rust client uses a unified `Error` type and a `Result` alias for all fallible operations.
+
+## Basic Usage
+
+```rust
+use fluss::error::{Error, Result};
+
+// All operations return Result
+let conn = FlussConnection::new(config).await?;
+let admin = conn.get_admin().await?;
+let table = conn.get_table(&table_path).await?;
+```
+
+Use the `?` operator to propagate errors, or `match` on specific variants for fine-grained handling.
+
+## Matching Error Variants
+
+```rust
+use fluss::error::Error;
+
+match result {
+    Ok(val) => {
+        // handle success
+    }
+    Err(Error::RpcError { message, .. }) => {
+        eprintln!("RPC failure: {}", message);
+    }
+    Err(Error::UnsupportedOperation { message }) => {
+        eprintln!("Unsupported: {}", message);
+    }
+    Err(Error::FlussAPIError { api_error }) => {
+        eprintln!("Server error: {}", api_error);
+    }
+    Err(e) => {
+        eprintln!("Unexpected error: {}", e);
+    }
+}
+```
+
+## Error Variants
+
+| Variant | Description |
+|--------------------------------|--------------------------------------------------------------|
+| `UnexpectedError` | General unexpected errors with a message and optional source |
+| `IoUnexpectedError` | I/O errors (network, file system) |
+| `RemoteStorageUnexpectedError` | Remote storage errors (OpenDAL backend failures) |
+| `RpcError` | RPC communication failures (connection refused, timeout) |
+| `RowConvertError` | Row conversion failures (type mismatch, invalid data) |
+| `ArrowError` | Arrow data handling errors (schema mismatch, encoding) |
+| `IllegalArgument` | Invalid arguments passed to an API method |
+| `UnsupportedOperation` | Operation not supported on the table type |
+| `FlussAPIError` | Server-side API errors returned by the Fluss cluster |
+
+Server-side errors are represented as `FlussAPIError` with a specific error code.
Use the `api_error()` helper to match them ergonomically: + +```rust +use fluss::error::FlussError; + +match result { + Err(ref e) if e.api_error() == Some(FlussError::InvalidTableException) => { + eprintln!("Invalid table: {}", e); + } + Err(ref e) if e.api_error() == Some(FlussError::PartitionNotExists) => { + eprintln!("Partition does not exist: {}", e); + } + Err(ref e) if e.api_error() == Some(FlussError::LeaderNotAvailableException) => { + eprintln!("Leader not available: {}", e); + } + _ => {} +} +``` + +## Common Error Scenarios + +### Connection Refused + +The Fluss cluster is not running or the address is incorrect. + +```rust +let result = FlussConnection::new(config).await; +match result { + Err(Error::RpcError { message, .. }) => { + eprintln!("Cannot connect to cluster: {}", message); + } + _ => {} +} +``` + +### Table Not Found + +The table does not exist or has been dropped. + +```rust +use fluss::error::{Error, FlussError}; + +// Admin operations return FlussError::TableNotExist (code 7) +let result = admin.drop_table(&table_path, false).await; +match result { + Err(ref e) if e.api_error() == Some(FlussError::TableNotExist) => { + eprintln!("Table not found: {}", e); + } + _ => {} +} + +// conn.get_table() wraps the error differently, match on FlussAPIError directly +let result = conn.get_table(&table_path).await; +match result { + Err(Error::FlussAPIError { ref api_error }) => { + eprintln!("Server error (code {}): {}", api_error.code, api_error.message); + } + _ => {} +} +``` + +### Partition Not Found + +The partition does not exist on a partitioned table. + +```rust +use fluss::error::FlussError; + +let result = admin.drop_partition(&table_path, &spec, false).await; +match result { + Err(ref e) if e.api_error() == Some(FlussError::PartitionNotExists) => { + eprintln!("Partition does not exist: {}", e); + } + _ => {} +} +``` + +### Schema Mismatch + +Row data does not match the expected table schema. + +```rust +let result = writer.append(&row); +match result { + Err(Error::RowConvertError { .. 
}) => { + eprintln!("Row does not match table schema"); + } + _ => {} +} +``` + +## Using `Result` in Application Code + +The `fluss::error::Result` type alias makes it easy to use Fluss errors with the `?` operator in your application functions: + +```rust +use fluss::error::Result; + +async fn my_pipeline() -> Result<()> { + let conn = FlussConnection::new(config).await?; + let admin = conn.get_admin().await?; + let table = conn.get_table(&table_path).await?; + let writer = table.new_append()?.create_writer()?; + writer.append(&row)?; + writer.flush().await?; + Ok(()) +} +``` + +For applications that use other error types alongside Fluss errors, you can convert with standard `From` / `Into` traits or use crates like `anyhow`: + +```rust +use anyhow::Result; + +#[tokio::main] +async fn main() -> Result<()> { + let conn = FlussConnection::new(config).await?; + // fluss::error::Error implements std::error::Error, + // so it converts into anyhow::Error automatically + Ok(()) +} +``` diff --git a/website/docs/user-guide/rust/example/_category_.json b/website/docs/user-guide/rust/example/_category_.json new file mode 100644 index 00000000..dd222949 --- /dev/null +++ b/website/docs/user-guide/rust/example/_category_.json @@ -0,0 +1,4 @@ +{ + "label": "Example", + "position": 5 +} diff --git a/website/docs/user-guide/rust/example/admin-operations.md b/website/docs/user-guide/rust/example/admin-operations.md new file mode 100644 index 00000000..7fcc4017 --- /dev/null +++ b/website/docs/user-guide/rust/example/admin-operations.md @@ -0,0 +1,118 @@ +--- +sidebar_position: 3 +--- +# Admin Operations + +## Get Admin Interface + +```rust +let admin = conn.get_admin().await?; +``` + +## Database Operations + +```rust +// Create database +admin.create_database("my_database", None, true).await?; + +// List all databases +let databases = admin.list_databases().await?; +println!("Databases: {:?}", databases); + +// Check if database exists +let exists = admin.database_exists("my_database").await?; + +// Get database information +let db_info = admin.get_database_info("my_database").await?; + +// Drop database +admin.drop_database("my_database", true, false).await?; +``` + +## Table Operations + +```rust +use fluss::metadata::{DataTypes, Schema, TableDescriptor, TablePath}; + +let table_descriptor = TableDescriptor::builder() + .schema( + Schema::builder() + .column("id", DataTypes::int()) + .column("name", DataTypes::string()) + .column("amount", DataTypes::bigint()) + .build()?, + ) + .build()?; + +let table_path = TablePath::new("my_database", "my_table"); + +// Create table +admin.create_table(&table_path, &table_descriptor, true).await?; + +// Get table information +let table_info = admin.get_table_info(&table_path).await?; +println!("Table: {}", table_info); + +// List tables in database +let tables = admin.list_tables("my_database").await?; + +// Check if table exists +let exists = admin.table_exists(&table_path).await?; + +// Drop table +admin.drop_table(&table_path, true).await?; +``` + +## Partition Operations + +```rust +use fluss::metadata::PartitionSpec; +use std::collections::HashMap; + +// List all partitions +let partitions = admin.list_partition_infos(&table_path).await?; + +// List partitions matching a spec +let mut filter = HashMap::new(); +filter.insert("year", "2024"); +let spec = PartitionSpec::new(filter); +let partitions = admin.list_partition_infos_with_spec(&table_path, Some(&spec)).await?; + +// Create partition +admin.create_partition(&table_path, &spec, true).await?; + +// Drop 
partition +admin.drop_partition(&table_path, &spec, true).await?; +``` + +## Offset Operations + +```rust +use fluss::rpc::message::OffsetSpec; + +let bucket_ids = vec![0, 1, 2]; + +// Get earliest offsets +let earliest = admin.list_offsets(&table_path, &bucket_ids, OffsetSpec::Earliest).await?; + +// Get latest offsets +let latest = admin.list_offsets(&table_path, &bucket_ids, OffsetSpec::Latest).await?; + +// Get offsets for a specific timestamp +let timestamp_ms = 1704067200000; // 2024-01-01 00:00:00 UTC +let offsets = admin.list_offsets( + &table_path, &bucket_ids, OffsetSpec::Timestamp(timestamp_ms), +).await?; + +// Get offsets for a specific partition +let partition_offsets = admin.list_partition_offsets( + &table_path, "partition_name", &bucket_ids, OffsetSpec::Latest, +).await?; +``` + +## Lake Snapshot + +```rust +let snapshot = admin.get_latest_lake_snapshot(&table_path).await?; +println!("Snapshot ID: {}", snapshot.snapshot_id); +``` diff --git a/website/docs/user-guide/rust/example/configuration.md b/website/docs/user-guide/rust/example/configuration.md new file mode 100644 index 00000000..82f536fb --- /dev/null +++ b/website/docs/user-guide/rust/example/configuration.md @@ -0,0 +1,24 @@ +--- +sidebar_position: 2 +--- +# Configuration + +```rust +use fluss::client::FlussConnection; +use fluss::config::Config; + +let mut config = Config::default(); +config.bootstrap_servers = "127.0.0.1:9123".to_string(); + +let conn = FlussConnection::new(config).await?; +``` + +## Configuration Options + +| Option | Description | Default | +|---------------------------|-------------------------------------------------------|------------------| +| `bootstrap_servers` | Coordinator server address | `127.0.0.1:9123` | +| `writer_request_max_size` | Maximum request size in bytes | 10 MB | +| `writer_acks` | Acknowledgment setting (`all` waits for all replicas) | `all` | +| `writer_retries` | Number of retries on failure | `i32::MAX` | +| `writer_batch_size` | Batch size for writes | 2 MB | diff --git a/website/docs/user-guide/rust/example/index.md b/website/docs/user-guide/rust/example/index.md new file mode 100644 index 00000000..dcee87b0 --- /dev/null +++ b/website/docs/user-guide/rust/example/index.md @@ -0,0 +1,56 @@ +--- +sidebar_position: 1 +--- +# Example + +Minimal working examples: connect to Fluss, create a table, write data, and read it back. 
+ +```rust +use fluss::client::FlussConnection; +use fluss::config::Config; +use fluss::error::Result; +use fluss::metadata::{DataTypes, Schema, TableDescriptor, TablePath}; +use fluss::row::{GenericRow, InternalRow}; +use std::time::Duration; + +#[tokio::main] +async fn main() -> Result<()> { + // Connect + let mut config = Config::default(); + config.bootstrap_servers = "127.0.0.1:9123".to_string(); + let conn = FlussConnection::new(config).await?; + let admin = conn.get_admin().await?; + + // Create a log table + let table_path = TablePath::new("fluss", "quickstart_rust"); + let descriptor = TableDescriptor::builder() + .schema( + Schema::builder() + .column("id", DataTypes::int()) + .column("name", DataTypes::string()) + .build()?, + ) + .build()?; + admin.create_table(&table_path, &descriptor, true).await?; + + // Write + let table = conn.get_table(&table_path).await?; + let writer = table.new_append()?.create_writer()?; + let mut row = GenericRow::new(2); + row.set_field(0, 1); + row.set_field(1, "hello"); + writer.append(&row)?; + writer.flush().await?; + + // Read + let scanner = table.new_scan().create_log_scanner()?; + scanner.subscribe(0, 0).await?; + let records = scanner.poll(Duration::from_secs(5)).await?; + for record in records { + let row = record.row(); + println!("id={}, name={}", row.get_int(0), row.get_string(1)); + } + + Ok(()) +} +``` diff --git a/website/docs/user-guide/rust/example/log-tables.md b/website/docs/user-guide/rust/example/log-tables.md new file mode 100644 index 00000000..3ba33542 --- /dev/null +++ b/website/docs/user-guide/rust/example/log-tables.md @@ -0,0 +1,140 @@ +--- +sidebar_position: 4 +--- +# Log Tables + +Log tables are append-only tables without primary keys, suitable for event streaming. + +## Creating a Log Table + +```rust +use fluss::metadata::{DataTypes, Schema, TableDescriptor, TablePath}; + +let table_descriptor = TableDescriptor::builder() + .schema( + Schema::builder() + .column("event_id", DataTypes::int()) + .column("event_type", DataTypes::string()) + .column("timestamp", DataTypes::bigint()) + .build()?, + ) + .build()?; + +let table_path = TablePath::new("fluss", "events"); +admin.create_table(&table_path, &table_descriptor, true).await?; +``` + +## Writing to Log Tables + +```rust +use fluss::row::{GenericRow, InternalRow}; + +let table = conn.get_table(&table_path).await?; +let append_writer = table.new_append()?.create_writer()?; + +let mut row = GenericRow::new(3); +row.set_field(0, 1); // event_id +row.set_field(1, "user_login"); // event_type +row.set_field(2, 1704067200000i64); // timestamp + +append_writer.append(&row)?; +append_writer.flush().await?; +``` + +Write operations use a **fire-and-forget** pattern for efficient batching. Each call queues the write and returns a `WriteResultFuture` immediately. Call `flush()` to ensure all queued writes are sent to the server. 
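+
+For example, several events can be queued back-to-back and flushed once at the end. The following is a minimal sketch that reuses the `events` table and `append_writer` from above:
+
+```rust
+// Queue several rows; appends are batched and not individually awaited.
+for (event_id, event_type, ts) in [
+    (2, "page_view", 1704067210000i64),
+    (3, "user_logout", 1704067220000),
+] {
+    let mut row = GenericRow::new(3);
+    row.set_field(0, event_id);
+    row.set_field(1, event_type);
+    row.set_field(2, ts);
+    append_writer.append(&row)?;
+}
+// A single flush sends any remaining queued batches and waits for the server.
+append_writer.flush().await?;
+```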
+ +For per-record acknowledgment: + +```rust +append_writer.append(&row)?.await?; +``` + +## Reading from Log Tables + +```rust +use std::time::Duration; + +let table = conn.get_table(&table_path).await?; +let log_scanner = table.new_scan().create_log_scanner()?; + +// Subscribe to bucket 0 starting from offset 0 +log_scanner.subscribe(0, 0).await?; + +// Poll for records +let records = log_scanner.poll(Duration::from_secs(10)).await?; + +for record in records { + let row = record.row(); + println!( + "event_id={}, event_type={}, timestamp={} @ offset={}", + row.get_int(0), + row.get_string(1), + row.get_long(2), + record.offset() + ); +} +``` + +**Subscribe from special offsets:** + +```rust +use fluss::client::EARLIEST_OFFSET; + +log_scanner.subscribe(0, EARLIEST_OFFSET).await?; // from earliest +log_scanner.subscribe(0, 42).await?; // from specific offset +``` + +**Subscribe from latest offset (only new records):** + +To start reading only new records, first resolve the current latest offset via `list_offsets`, then subscribe at that offset: + +```rust +use fluss::rpc::message::OffsetSpec; + +let admin = conn.get_admin().await?; +let offsets = admin.list_offsets(&table_path, &[0], OffsetSpec::Latest).await?; +let latest = offsets[&0]; +log_scanner.subscribe(0, latest).await?; +``` + +**Subscribe to all buckets:** + +```rust +let num_buckets = table.get_table_info().get_num_buckets(); +for bucket_id in 0..num_buckets { + log_scanner.subscribe(bucket_id, 0).await?; +} +``` + +**Subscribe to multiple buckets at once:** + +```rust +use std::collections::HashMap; + +let mut bucket_offsets = HashMap::new(); +bucket_offsets.insert(0, 0i64); +bucket_offsets.insert(1, 100i64); +log_scanner.subscribe_buckets(&bucket_offsets).await?; +``` + +**Unsubscribe from a bucket:** + +```rust +// Non-partitioned tables +log_scanner.unsubscribe(bucket_id).await?; + +// Partitioned tables +log_scanner.unsubscribe_partition(partition_id, bucket_id).await?; +``` + +## Column Projection + +```rust +// Project by column index +let scanner = table.new_scan().project(&[0, 2])?.create_log_scanner()?; + +// Project by column name +let scanner = table.new_scan() + .project_by_name(&["event_id", "timestamp"])? + .create_log_scanner()?; +``` diff --git a/website/docs/user-guide/rust/example/partitioned-tables.md b/website/docs/user-guide/rust/example/partitioned-tables.md new file mode 100644 index 00000000..3edf4d88 --- /dev/null +++ b/website/docs/user-guide/rust/example/partitioned-tables.md @@ -0,0 +1,215 @@ +--- +sidebar_position: 6 +--- +# Partitioned Tables + +Partitioned tables distribute data across partitions based on partition column values, enabling efficient data organization and querying. Both log tables and primary key tables support partitioning. 
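+
+Because partitions must exist before data can be written to them (see the sections below), a common pattern is to pre-create the partitions you expect to write to. The following is a minimal sketch, assuming a hypothetical table partitioned by a single `dt` column:
+
+```rust
+use fluss::metadata::PartitionSpec;
+use std::collections::HashMap;
+
+// Pre-create one partition per day for a small backfill window.
+for dt in ["2024-01-13", "2024-01-14", "2024-01-15"] {
+    let mut values = HashMap::new();
+    values.insert("dt", dt);
+    // ignore_if_exists = true keeps this idempotent
+    admin.create_partition(&table_path, &PartitionSpec::new(values), true).await?;
+}
+```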
+ +## Partitioned Log Tables + +### Creating a Partitioned Log Table + +```rust +use fluss::metadata::{DataTypes, LogFormat, Schema, TableDescriptor, TablePath}; + +let table_descriptor = TableDescriptor::builder() + .schema( + Schema::builder() + .column("event_id", DataTypes::int()) + .column("event_type", DataTypes::string()) + .column("dt", DataTypes::string()) + .column("region", DataTypes::string()) + .build()?, + ) + .partitioned_by(vec!["dt", "region"]) + .log_format(LogFormat::ARROW) + .build()?; + +let table_path = TablePath::new("fluss", "partitioned_events"); +admin.create_table(&table_path, &table_descriptor, true).await?; +``` + +### Writing to Partitioned Log Tables + +**Partitions must exist before writing data, otherwise the client will by default retry indefinitely.** Include partition column values in each row, the client routes records to the correct partition automatically. + +```rust +use fluss::metadata::PartitionSpec; +use std::collections::HashMap; + +let table = conn.get_table(&table_path).await?; + +// Create the partition before writing +let mut partition_values = HashMap::new(); +partition_values.insert("dt", "2024-01-15"); +partition_values.insert("region", "US"); +admin.create_partition(&table_path, &PartitionSpec::new(partition_values), true).await?; + +let append_writer = table.new_append()?.create_writer()?; + +let mut row = GenericRow::new(4); +row.set_field(0, 1); // event_id +row.set_field(1, "user_login"); // event_type +row.set_field(2, "2024-01-15"); // dt (partition column) +row.set_field(3, "US"); // region (partition column) + +append_writer.append(&row)?; +append_writer.flush().await?; +``` + +### Reading from Partitioned Log Tables + +For partitioned tables, use partition-aware subscribe methods. + +```rust +use std::time::Duration; + +let table = conn.get_table(&table_path).await?; +let admin = conn.get_admin().await?; +let partitions = admin.list_partition_infos(&table_path).await?; + +let log_scanner = table.new_scan().create_log_scanner()?; + +// Subscribe to each partition's buckets +for partition_info in &partitions { + let partition_id = partition_info.get_partition_id(); + let num_buckets = table.get_table_info().get_num_buckets(); + for bucket_id in 0..num_buckets { + log_scanner.subscribe_partition(partition_id, bucket_id, 0).await?; + } +} + +let records = log_scanner.poll(Duration::from_secs(10)).await?; +for record in records { + println!("Record: {:?}", record.row()); +} +``` + +Subscribe to multiple partition-buckets at once: + +```rust +use std::collections::HashMap; + +let mut partition_bucket_offsets = HashMap::new(); +partition_bucket_offsets.insert((partition_id, 0), 0i64); +partition_bucket_offsets.insert((partition_id, 1), 0i64); +log_scanner.subscribe_partition_buckets(&partition_bucket_offsets).await?; +``` + +### Managing Partitions + +```rust +use fluss::metadata::PartitionSpec; +use std::collections::HashMap; + +// Create a partition +let mut partition_values = HashMap::new(); +partition_values.insert("dt", "2024-01-15"); +partition_values.insert("region", "EMEA"); +let spec = PartitionSpec::new(partition_values); +admin.create_partition(&table_path, &spec, true).await?; + +// List all partitions +let partitions = admin.list_partition_infos(&table_path).await?; +for partition in &partitions { + println!( + "Partition: id={}, name={}", + partition.get_partition_id(), + partition.get_partition_name() + ); +} + +// List with filter +let mut partial_values = HashMap::new(); +partial_values.insert("dt", "2024-01-15"); +let 
partial_spec = PartitionSpec::new(partial_values); +let filtered = admin.list_partition_infos_with_spec( + &table_path, Some(&partial_spec), +).await?; + +// Drop a partition +admin.drop_partition(&table_path, &spec, true).await?; +``` + +## Partitioned Primary Key Tables + +Partitioned KV tables combine partitioning with primary key operations. Partition columns must be part of the primary key. + +### Creating a Partitioned Primary Key Table + +```rust +use fluss::metadata::{DataTypes, KvFormat, Schema, TableDescriptor, TablePath}; + +let table_descriptor = TableDescriptor::builder() + .schema( + Schema::builder() + .column("user_id", DataTypes::int()) + .column("region", DataTypes::string()) + .column("zone", DataTypes::bigint()) + .column("score", DataTypes::bigint()) + .primary_key(vec!["user_id", "region", "zone"]) + .build()?, + ) + .partitioned_by(vec!["region", "zone"]) + .kv_format(KvFormat::COMPACTED) + .build()?; + +let table_path = TablePath::new("fluss", "partitioned_users"); +admin.create_table(&table_path, &table_descriptor, true).await?; +``` + +### Writing to Partitioned Primary Key Tables + +**Partitions must exist before upserting data, otherwise the client will by default retry indefinitely.** + +```rust +use fluss::metadata::PartitionSpec; +use std::collections::HashMap; + +let table = conn.get_table(&table_path).await?; + +// Create partitions first +for (region, zone) in [("APAC", "1"), ("EMEA", "2"), ("US", "3")] { + let mut values = HashMap::new(); + values.insert("region", region); + values.insert("zone", zone); + admin.create_partition(&table_path, &PartitionSpec::new(values), true).await?; +} + +let table_upsert = table.new_upsert()?; +let upsert_writer = table_upsert.create_writer()?; + +for (user_id, region, zone, score) in [ + (1001, "APAC", 1i64, 1234i64), + (1002, "EMEA", 2, 2234), + (1003, "US", 3, 3234), +] { + let mut row = GenericRow::new(4); + row.set_field(0, user_id); + row.set_field(1, region); + row.set_field(2, zone); + row.set_field(3, score); + upsert_writer.upsert(&row)?; +} +upsert_writer.flush().await?; +``` + +### Looking Up Records in Partitioned Tables + +Lookup requires all primary key columns including partition columns. + +```rust +let mut lookuper = table.new_lookup()?.create_lookuper()?; + +let mut key = GenericRow::new(3); +key.set_field(0, 1001); // user_id +key.set_field(1, "APAC"); // region (partition column) +key.set_field(2, 1i64); // zone (partition column) + +let result = lookuper.lookup(&key).await?; +if let Some(row) = result.get_single_row()? { + println!("Found: score={}", row.get_long(3)); +} +``` + +> **Note:** Scanning partitioned primary key tables is not supported. Use lookup operations instead. diff --git a/website/docs/user-guide/rust/example/primary-key-tables.md b/website/docs/user-guide/rust/example/primary-key-tables.md new file mode 100644 index 00000000..5b299cca --- /dev/null +++ b/website/docs/user-guide/rust/example/primary-key-tables.md @@ -0,0 +1,114 @@ +--- +sidebar_position: 5 +--- +# Primary Key Tables + +Primary key tables (KV tables) support upsert, delete, and lookup operations. 
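+
+As a quick orientation before the detailed sections below: the operations compose naturally, so a delete followed by a lookup on the same key finds nothing. The following is a minimal sketch, assuming the `users` table, `upsert_writer`, and `lookuper` created in the sections that follow:
+
+```rust
+// Delete by primary key (only the key field needs to be set) ...
+let mut row = GenericRow::new(3);
+row.set_field(0, 2); // id of the record to delete
+upsert_writer.delete(&row)?;
+upsert_writer.flush().await?;
+
+// ... then a lookup for the same key returns no row.
+let mut key = GenericRow::new(1);
+key.set_field(0, 2);
+let result = lookuper.lookup(&key).await?;
+assert!(result.get_single_row()?.is_none());
+```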
+ +## Creating a Primary Key Table + +```rust +use fluss::metadata::{DataTypes, Schema, TableDescriptor, TablePath}; + +let table_descriptor = TableDescriptor::builder() + .schema( + Schema::builder() + .column("id", DataTypes::int()) + .column("name", DataTypes::string()) + .column("age", DataTypes::bigint()) + .primary_key(vec!["id"]) + .build()?, + ) + .build()?; + +let table_path = TablePath::new("fluss", "users"); +admin.create_table(&table_path, &table_descriptor, true).await?; +``` + +## Upserting Records + +```rust +use fluss::row::{GenericRow, InternalRow}; + +let table = conn.get_table(&table_path).await?; +let table_upsert = table.new_upsert()?; +let upsert_writer = table_upsert.create_writer()?; + +for (id, name, age) in [(1, "Alice", 25i64), (2, "Bob", 30), (3, "Charlie", 35)] { + let mut row = GenericRow::new(3); + row.set_field(0, id); + row.set_field(1, name); + row.set_field(2, age); + upsert_writer.upsert(&row)?; +} +upsert_writer.flush().await?; +``` + +## Updating Records + +Upsert with the same primary key to update an existing record. + +```rust +let mut row = GenericRow::new(3); +row.set_field(0, 1); // id (primary key) +row.set_field(1, "Alice"); +row.set_field(2, 26i64); // updated age + +upsert_writer.upsert(&row)?; +upsert_writer.flush().await?; +``` + +## Deleting Records + +```rust +// Only primary key field needs to be set +let mut row = GenericRow::new(3); +row.set_field(0, 2); // id of record to delete + +upsert_writer.delete(&row)?; +upsert_writer.flush().await?; +``` + +## Partial Updates + +Update only specific columns while preserving others. + +```rust +// By column indices +let partial_upsert = table_upsert.partial_update(Some(vec![0, 2]))?; +let partial_writer = partial_upsert.create_writer()?; + +let mut row = GenericRow::new(3); +row.set_field(0, 1); // id (primary key, required) +row.set_field(2, 27i64); // age (will be updated) +// name will remain unchanged + +partial_writer.upsert(&row)?; +partial_writer.flush().await?; + +// By column names +let partial_upsert = table_upsert.partial_update_with_column_names(&["id", "age"])?; +let partial_writer = partial_upsert.create_writer()?; +``` + +## Looking Up Records + +```rust +let mut lookuper = table.new_lookup()?.create_lookuper()?; + +let mut key = GenericRow::new(1); +key.set_field(0, 1); // id to lookup + +let result = lookuper.lookup(&key).await?; + +if let Some(row) = result.get_single_row()? { + println!( + "Found: id={}, name={}, age={}", + row.get_int(0), + row.get_string(1), + row.get_long(2) + ); +} else { + println!("Record not found"); +} +``` diff --git a/website/docs/user-guide/rust/installation.md b/website/docs/user-guide/rust/installation.md new file mode 100644 index 00000000..e6987831 --- /dev/null +++ b/website/docs/user-guide/rust/installation.md @@ -0,0 +1,76 @@ +--- +sidebar_position: 1 +--- +# Installation + +The Fluss Rust client is published to [crates.io](https://crates.io/crates/fluss-rs) as `fluss-rs`. The crate's library name is `fluss`, so you import it with `use fluss::...`. 
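+
+For example, the imports used throughout this guide look like this (module paths as they appear in the rest of the documentation):
+
+```rust
+use fluss::client::FlussConnection;
+use fluss::config::Config;
+use fluss::error::Result;
+use fluss::metadata::{DataTypes, Schema, TableDescriptor, TablePath};
+use fluss::row::{GenericRow, InternalRow};
+```
+
+Add the crate under `[dependencies]`: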
+ +```toml +[dependencies] +fluss-rs = "0.1" +tokio = { version = "1", features = ["full"] } +``` + +## Feature Flags + +```toml +[dependencies] +# Default: memory and filesystem storage +fluss-rs = "0.1" + +# With S3 storage support +fluss-rs = { version = "0.1", features = ["storage-s3"] } + +# With OSS storage support +fluss-rs = { version = "0.1", features = ["storage-oss"] } + +# All storage backends +fluss-rs = { version = "0.1", features = ["storage-all"] } +``` + +Available features: +- `storage-memory` (default: In-memory storage) +- `storage-fs` (default: Local filesystem storage) +- `storage-s3` (Amazon S3 storage) +- `storage-oss` (Alibaba OSS storage) +- `storage-all` (All storage backends) + +## Git or Path Dependency + +For development against unreleased changes: + +```toml +[dependencies] +# From Git +fluss = { git = "https://github.com/apache/fluss-rust.git", package = "fluss-rs" } + +# From local path +fluss = { path = "/path/to/fluss-rust/crates/fluss", package = "fluss-rs" } +``` + +> **Note:** When using `git` or `path` dependencies, the `package = "fluss-rs"` field is required so that Cargo resolves the correct package while still allowing `use fluss::...` imports. + +## Building from Source + +**Prerequisites:** Rust 1.85+, Protobuf compiler (`protoc`) + +```bash +git clone https://github.com/apache/fluss-rust.git +cd fluss-rust +``` + +Install `protoc`: + +```bash +# macOS +brew install protobuf + +# Ubuntu/Debian +sudo apt-get install protobuf-compiler +``` + +Build: + +```bash +cargo build --workspace --all-targets +``` diff --git a/website/docusaurus.config.ts b/website/docusaurus.config.ts new file mode 100644 index 00000000..0d974e95 --- /dev/null +++ b/website/docusaurus.config.ts @@ -0,0 +1,84 @@ +import {themes as prismThemes} from 'prism-react-renderer'; +import type {Config} from '@docusaurus/types'; +import type * as Preset from '@docusaurus/preset-classic'; + +const config: Config = { + title: 'Apache Fluss Clients', + tagline: 'Rust, Python, and C++ clients for Apache Fluss', + favicon: 'img/logo/fluss_favicon.svg', + + url: 'https://fluss.apache.org/', + baseUrl: '/fluss-rust/', + + organizationName: 'apache', + projectName: 'fluss-rust', + + onBrokenLinks: 'throw', + + i18n: { + defaultLocale: 'en', + locales: ['en'], + }, + + presets: [ + [ + 'classic', + { + docs: { + routeBasePath: '/', + sidebarPath: './sidebars.ts', + editUrl: 'https://github.com/apache/fluss-rust/edit/main/website/', + }, + blog: false, + theme: { + customCss: './src/css/custom.css', + }, + } satisfies Preset.Options, + ], + ], + + themeConfig: { + image: 'img/logo/png/colored_logo.png', + colorMode: { + defaultMode: 'light', + disableSwitch: true, + }, + navbar: { + title: '', + logo: { + alt: 'Fluss', + src: 'img/logo/svg/colored_logo.svg', + }, + items: [ + { + type: 'docSidebar', + sidebarId: 'docsSidebar', + position: 'left', + label: 'Client Docs', + }, + { + href: 'https://fluss.apache.org/', + label: 'Fluss', + position: 'left', + }, + { + href: 'https://github.com/apache/fluss-rust', + position: 'right', + className: 'header-github-link', + 'aria-label': 'GitHub repository', + }, + ], + }, + footer: { + style: 'dark', + copyright: `Copyright © ${new Date().getFullYear()} The Apache Software Foundation, Licensed under the Apache License, Version 2.0.`, + }, + prism: { + theme: prismThemes.vsDark, + darkTheme: prismThemes.dracula, + additionalLanguages: ['rust', 'toml', 'bash', 'cmake'], + }, + } satisfies Preset.ThemeConfig, +}; + +export default config; diff --git 
a/website/package.json b/website/package.json new file mode 100644 index 00000000..644a7051 --- /dev/null +++ b/website/package.json @@ -0,0 +1,43 @@ +{ + "name": "fluss-clients-website", + "version": "0.0.0", + "private": true, + "scripts": { + "docusaurus": "docusaurus", + "start": "docusaurus start", + "build": "docusaurus build", + "swizzle": "docusaurus swizzle", + "clear": "docusaurus clear", + "serve": "docusaurus serve" + }, + "dependencies": { + "@docusaurus/core": "^3.9.2", + "@docusaurus/preset-classic": "^3.9.2", + "@mdx-js/react": "^3.0.0", + "clsx": "^2.0.0", + "prism-react-renderer": "^2.3.0", + "react": "^18.0.0", + "react-dom": "^18.0.0" + }, + "devDependencies": { + "@docusaurus/module-type-aliases": "^3.9.2", + "@docusaurus/tsconfig": "^3.9.2", + "@docusaurus/types": "^3.9.2", + "typescript": "~5.5.2" + }, + "browserslist": { + "production": [ + ">0.5%", + "not dead", + "not op_mini all" + ], + "development": [ + "last 3 chrome version", + "last 3 firefox version", + "last 5 safari version" + ] + }, + "engines": { + "node": ">=20.0" + } +} diff --git a/website/sidebars.ts b/website/sidebars.ts new file mode 100644 index 00000000..97f33802 --- /dev/null +++ b/website/sidebars.ts @@ -0,0 +1,24 @@ +import type {SidebarsConfig} from '@docusaurus/plugin-content-docs'; + +const sidebars: SidebarsConfig = { + docsSidebar: [ + 'index', + { + type: 'category', + label: 'User Guide', + items: [ + {type: 'autogenerated', dirName: 'user-guide'}, + ], + }, + { + type: 'category', + label: 'Developer Guide', + items: [ + 'developer-guide/contributing', + 'developer-guide/release', + ], + }, + ], +}; + +export default sidebars; diff --git a/website/src/css/custom.css b/website/src/css/custom.css new file mode 100644 index 00000000..9143372f --- /dev/null +++ b/website/src/css/custom.css @@ -0,0 +1,209 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Copied from the main fluss.apache.org website (fluss/website/src/css/custom.css) + * to ensure visual consistency. + */ + +/* Import Inter font from Google Fonts */ +@import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700;800&display=swap'); + +/* You can override the default Infima variables here. 
*/ +:root { + --ifm-color-primary: #0071e3; + --ifm-color-primary-dark: #0066cc; + --ifm-color-primary-darker: #0060c1; + --ifm-color-primary-darkest: #004f9f; + --ifm-color-primary-light: #007cfa; + --ifm-color-primary-lighter: #0682ff; + --ifm-color-primary-lightest: #2893ff; + --ifm-code-font-size: 90%; + --ifm-font-family-base: 'Inter', -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, 'Helvetica Neue', Arial, 'PingFang SC', 'Hiragino Sans GB', 'Microsoft YaHei', sans-serif; + --docusaurus-highlighted-code-line-bg: #E2E9F3; + + --ifm-menu-color-background-active: #edeefa99; + --ifm-menu-color-background-hover: #edeefa99; +} + + +.navbar__brand { + font-family: monaco; + color: inherit; +} + +.header-github-link:hover { + opacity: 0.6; +} + +.header-github-link::before { + content: ''; + width: 24px; + height: 24px; + display: flex; + background: url("data:image/svg+xml,%3Csvg viewBox='0 0 24 24' xmlns='http://www.w3.org/2000/svg'%3E%3Cpath d='M12 .297c-6.63 0-12 5.373-12 12 0 5.303 3.438 9.8 8.205 11.385.6.113.82-.258.82-.577 0-.285-.01-1.04-.015-2.04-3.338.724-4.042-1.61-4.042-1.61C4.422 18.07 3.633 17.7 3.633 17.7c-1.087-.744.084-.729.084-.729 1.205.084 1.838 1.236 1.838 1.236 1.07 1.835 2.809 1.305 3.495.998.108-.776.417-1.305.76-1.605-2.665-.3-5.466-1.332-5.466-5.93 0-1.31.465-2.38 1.235-3.22-.135-.303-.54-1.523.105-3.176 0 0 1.005-.322 3.3 1.23.96-.267 1.98-.399 3-.405 1.02.006 2.04.138 3 .405 2.28-1.552 3.285-1.23 3.285-1.23.645 1.653.24 2.873.12 3.176.765.84 1.23 1.91 1.23 3.22 0 4.61-2.805 5.625-5.475 5.92.42.36.81 1.096.81 2.22 0 1.606-.015 2.896-.015 3.286 0 .315.21.69.825.57C20.565 22.092 24 17.592 24 12.297c0-6.627-5.373-12-12-12'/%3E%3C/svg%3E") + no-repeat; +} + + +.menu__list-item { + font-size: 0.95rem; + font-weight: 500; +} + +.menu__link--sublist-caret:after { + background: var(--ifm-menu-link-sublist-icon) 50% / 1.5rem 1.5rem; +} + + +.markdown { + padding-left: 1rem; + h1, + h2, + h3, + h4, + h5, + h6 { + color: #1d1d1d; + margin-bottom: 0.3125rem; + font-weight: 700; + } + + b, + strong { + font-weight: 700; + color: #1d1d1d; + } + + h1, + h1:first-child { + font-size: 2.5rem; + margin-bottom: 1.5rem; + margin-top: 0; + } + + h2 { + font-size: 2rem; + margin-bottom: 1.25rem; + margin-top: 2rem; + padding-top: 2rem; + border-top: 1px solid #e6e7e9; + } + + h3 { + font-size: 1.5rem; + margin-bottom: 1.25rem; + margin-top: 1rem; + } + p { + line-height: 1.875; + + code { + border-radius: 4px; + background-color: #edf2fa; + border: none; + padding: 3px 4px; + font-size: 14px; + color: #4c576c; + } + } + + li > code { + border-radius: 4px; + background-color: #edf2fa; + border: none; + padding: 3px 4px; + font-size: 14px; + color: #4c576c; + } + + table thead tr { + background-color: #f7f9fe; + } + + table thead th { + background-color: #f7f9fe; + color: #1d1d1d; + font-size: 1rem; + font-weight: 500; + } + + table tr { + border-bottom: none; + background-color: var(--global-colors-white); + font-size: var(--global-font-size-small); + + code { + border-radius: 4px; + background-color: #edf2fa; + border: none; + padding: 3px 4px; + font-size: 14px; + color: #4c576c; + } + } + + table tr th { + padding: 0.53rem 0.8125rem; + border-color: #dfe5f0; + } + + table tr td { + padding: 0.65rem 0.8125rem; + border-color: #dfe5f0; + } + a { + color: var(--ifm-color-primary); + } + ul { + padding-left: 20px; + li { + margin-top: 4px; + position: relative; + list-style: initial; + } + } + ol { + padding-left: 20px; + li { + list-style: decimal; + } + } +} + 
+.theme-doc-markdown { + header { + margin-top: 1rem; + + & + h1 { + display: none; + } + } +} + +.breadcrumbs__item--active .breadcrumbs__link { + background: var(--ifm-menu-color-background-active); +} + +.footer__copyright { + color: #dfe5f0; + font-size: .75rem; + line-height: 1.8; + opacity: .6; + text-align: center; + width: 98%; +} diff --git a/website/static/img/logo/fluss_favicon.svg b/website/static/img/logo/fluss_favicon.svg new file mode 100644 index 00000000..7c044d55 --- /dev/null +++ b/website/static/img/logo/fluss_favicon.svg @@ -0,0 +1,19 @@ + + + \ No newline at end of file diff --git a/website/static/img/logo/png/colored_logo.png b/website/static/img/logo/png/colored_logo.png new file mode 100644 index 00000000..2cd7dd37 Binary files /dev/null and b/website/static/img/logo/png/colored_logo.png differ diff --git a/website/static/img/logo/svg/colored_logo.svg b/website/static/img/logo/svg/colored_logo.svg new file mode 100644 index 00000000..3b136ac4 --- /dev/null +++ b/website/static/img/logo/svg/colored_logo.svg @@ -0,0 +1,19 @@ + + + \ No newline at end of file diff --git a/website/tsconfig.json b/website/tsconfig.json new file mode 100644 index 00000000..d250afae --- /dev/null +++ b/website/tsconfig.json @@ -0,0 +1,6 @@ +{ + "extends": "@docusaurus/tsconfig", + "compilerOptions": { + "baseUrl": "." + } +}