diff --git a/.github/workflows/typos.yml b/.github/workflows/typos.yml new file mode 100644 index 0000000000..4f9f13dae2 --- /dev/null +++ b/.github/workflows/typos.yml @@ -0,0 +1,13 @@ +name: Typo checker +on: [pull_request] + +jobs: + run: + name: Spell Check with Typos + runs-on: "ubuntu-24.04" + steps: + - name: Checkout Actions Repository + uses: actions/checkout@v4 + + - name: Check spelling of the entire repository + uses: crate-ci/typos@v1.26.0 \ No newline at end of file diff --git a/.typos.toml b/.typos.toml new file mode 100644 index 0000000000..9b142594f9 --- /dev/null +++ b/.typos.toml @@ -0,0 +1,10 @@ +[default.extend-words] +DNE = "DNE" +arange = "arange" +nd = "nd" +terrestial = "terrestial" +abd = "abd" +afe = "afe" + +[files] +extend-exclude = ["notebooks/*.ipynb"] \ No newline at end of file diff --git a/benchmarks/flat/benchmark.py b/benchmarks/flat/benchmark.py index a56fbf04ca..7d7cf36446 100755 --- a/benchmarks/flat/benchmark.py +++ b/benchmarks/flat/benchmark.py @@ -30,9 +30,9 @@ def benchmark( dim: int, metric: str, ): - querys = [np.random.random((dim,)).reshape(-1) for _ in range(32)] + queries = [np.random.random((dim,)).reshape(-1) for _ in range(32)] # warmup - for query in querys: + for query in queries: ds.to_table( nearest={"column": "vector", "k": 10, "q": query, "use_index": False} ) @@ -40,7 +40,7 @@ def benchmark( latency = [] for _ in range(10): - for query in querys: + for query in queries: start = time.perf_counter() ds.to_table( nearest={ diff --git a/benchmarks/full_report/report.ipynb b/benchmarks/full_report/report.ipynb index 039776ab62..11a4924c4c 100644 --- a/benchmarks/full_report/report.ipynb +++ b/benchmarks/full_report/report.ipynb @@ -2435,7 +2435,7 @@ } ], "source": [ - "# test NYT -- TF-IDF sparse vectors projected on to 256D dense -- normlized L2\n", + "# test NYT -- TF-IDF sparse vectors projected on to 256D dense -- normalized L2\n", "data = _get_nyt_vectors()\n", "data = data[np.linalg.norm(data, axis=1) != 0]\n", "data = np.unique(data, axis=0)\n", diff --git a/benchmarks/sift/README.md b/benchmarks/sift/README.md index 666fb05e18..2ab70dc49a 100644 --- a/benchmarks/sift/README.md +++ b/benchmarks/sift/README.md @@ -5,7 +5,7 @@ Dataset URI: http://corpus-texmex.irisa.fr/ The SIFT/GIST-1M benchmarks make use of the [LanceDB](https://github.com/lancedb/lancedb) API to index, manage and query the datasets. Ensure the dependencies are installed. LanceDB is built on top of Lance and stores everything as Lance datasets. 
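A quick aside before the install commands below: the benchmark.py hunk above is the standard warmup-then-measure latency loop. A minimal, self-contained sketch of that pattern — the `run_query` callable stands in for the `ds.to_table(nearest=...)` call and is hypothetical:

```python
import time

import numpy as np

def measure_latency(run_query, dim=128, n_queries=32, rounds=10):
    # Same shape as the benchmark above: build random queries, do one warmup
    # pass so caches settle, then time each query individually.
    queries = [np.random.random((dim,)) for _ in range(n_queries)]
    for query in queries:  # warmup
        run_query(query)
    latency = []
    for _ in range(rounds):
        for query in queries:
            start = time.perf_counter()
            run_query(query)
            latency.append(time.perf_counter() - start)
    return np.mean(latency), np.percentile(latency, 99)
```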
```sh -# Pin the lancedb version to the latest one availale on your own benchmark +# Pin the lancedb version to the latest one available on your own benchmark pip install lancedb==0.3.6 pip install pandas~=2.1.0 pip install duckdb~=0.9.0 diff --git a/benchmarks/sift/gt.py b/benchmarks/sift/gt.py index a7f165fba2..b305fce263 100755 --- a/benchmarks/sift/gt.py +++ b/benchmarks/sift/gt.py @@ -34,7 +34,7 @@ def generate_gt(args): col = args.col or infer_vector_column(ds) if col is None: raise ValueError( - "Can not infer vector column, please specifiy the column explicitly" + "Cannot infer vector column, please specify the column explicitly" ) samples = ds.sample(args.samples, columns=[col])[col] diff --git a/benchmarks/sift/perf.py b/benchmarks/sift/perf.py index e00446c793..2fed623605 100644 --- a/benchmarks/sift/perf.py +++ b/benchmarks/sift/perf.py @@ -77,9 +77,9 @@ def summary(self): series = [] for k, v in self._configs.items(): timer = self._timers[k] - config_ser = pd.Series(v) - time_ser = timer.summary() - series.append(pd.concat([config_ser, time_ser])) + config_series = pd.Series(v) + time_series = timer.summary() + series.append(pd.concat([config_series, time_series])) return pd.DataFrame(series) diff --git a/docs/examples/llm_dataset_creation.rst b/docs/examples/llm_dataset_creation.rst index dc72ca17b4..dd4922af8a 100644 --- a/docs/examples/llm_dataset_creation.rst +++ b/docs/examples/llm_dataset_creation.rst @@ -3,7 +3,7 @@ Creating text dataset for LLM training using Lance Lance can be used for creating and caching a text (or code) dataset for pre-training / fine-tuning of Large Language Models. The need for this arises when one needs to train a model on a subset of data or process the data in chunks without downloading -all of it on the disk at once. This becomes a considerable problem when you just want a subset of a Terrabyte or Petabyte-scale dataset. +all of it on the disk at once. This becomes a considerable problem when you just want a subset of a Terabyte or Petabyte-scale dataset. In this example, we will be bypassing this problem by downloading a text dataset in parts, tokenizing it and saving it as a Lance dataset. This can be done for as many or as few data samples as you wish with average memory consumption of approximately 3-4 GB! @@ -41,7 +41,7 @@ Now we will define a function to help us with tokenizing our samples, one-by-one def tokenize(sample, field='text'): return tokenizer(sample[field])['input_ids'] -This function will recieve a sample from a huggingface dataset and tokenize the values in the `field` column. This is the main text you want +This function will receive a sample from a huggingface dataset and tokenize the values in the `field` column. This is the main text you want to tokenize. Creating a Lance dataset @@ -70,7 +70,7 @@ let's define the main function that takes in the dataset, number of samples and ) This function will be iterating over the huggingface dataset, one sample at a time, tokenizing the sample and yielding a pyarrow `RecordBatch` -with all the tokens. We will do this untill we have reached the `num_samples` number of samples or the end of the dataset, whichever comes first. +with all the tokens. We will do this until we have reached the `num_samples` number of samples or the end of the dataset, whichever comes first. Please note that by 'sample', we mean one example (row) in the original dataset. What one example exactly means will depend on the dataset itself as it could be one line or an entire file of text.
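As an aside, the generator loop this hunk's prose describes can be sketched compactly. This assumes a 🤗 streaming dataset and the `tokenize` logic above; names like `to_record_batches` are illustrative, not the doc's actual code:

```python
import pyarrow as pa

def to_record_batches(hf_dataset, tokenizer, num_samples, field="text"):
    # Tokenize one sample at a time and yield its tokens as a RecordBatch,
    # stopping at num_samples or the end of the dataset, whichever comes first.
    for idx, sample in enumerate(hf_dataset):
        if idx >= num_samples:
            break
        tokens = tokenizer(sample[field])["input_ids"]
        yield pa.RecordBatch.from_arrays(
            [pa.array(tokens, type=pa.int64())], names=["input_ids"]
        )
```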
In this example, it varies in length between a line and a paragraph of text. diff --git a/docs/examples/llm_training.rst b/docs/examples/llm_training.rst index 835a4a16ff..dec16fd182 100644 --- a/docs/examples/llm_training.rst +++ b/docs/examples/llm_training.rst @@ -9,7 +9,7 @@ In this example, we will be training an LLM using 🤗 transformers on the token Imports and Setup ~~~~~~~~~~~~~~~~~ -Let's setup our enviornment by doing all the necessary imports and defining a few basic things. +Let's set up our environment by doing all the necessary imports and defining a few basic things. .. code-block:: python diff --git a/docs/format.rst b/docs/format.rst index 75afa4abef..13cfbc2712 100644 --- a/docs/format.rst +++ b/docs/format.rst @@ -108,7 +108,7 @@ The following values are supported: - 0.16.0 - Any - Rework of the Lance file format that removed row groups and introduced null - support for lists, fixed size lists, and primtives + support for lists, fixed size lists, and primitives * - 2.1 (unstable) - None - Any diff --git a/docs/performance.rst b/docs/performance.rst index 1fd27e8ede..2684a3d234 100644 --- a/docs/performance.rst +++ b/docs/performance.rst @@ -7,7 +7,7 @@ Threading Model --------------- Lance is designed to be thread-safe and performant. Lance APIs can be called concurrently unless -explicity stated otherwise. Users may create multiple tables and share tables between threads. +explicitly stated otherwise. Users may create multiple tables and share tables between threads. Operations may run in parallel on the same table, but some operations may lead to conflicts. For details see :ref:`conflict_resolution`. @@ -80,4 +80,4 @@ with 1024 rows per batch is more appropriate. In summary, scans could use up to ``(2 * io_buffer_size) + (batch_size * num_compute_threads)`` bytes of memory. Keep in mind that ``io_buffer_size`` is a soft limit (e.g. we cannot read less than one page at a time right now) -and so it is not neccesarily a bug if you see memory usage exceed this limit by a small margin. \ No newline at end of file +and so it is not necessarily a bug if you see memory usage exceed this limit by a small margin.
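To make the memory bound above concrete, a small helper applying the documented formula (the example numbers are illustrative only):

```python
def scan_memory_bound(io_buffer_size, batch_bytes, num_compute_threads):
    # From the docs above: up to two io_buffer_size's worth of in-flight I/O
    # plus one decoded batch per compute thread. A soft limit, not a hard cap.
    return 2 * io_buffer_size + batch_bytes * num_compute_threads

# e.g. 2 GiB I/O buffer, 1 MiB batches, 16 threads -> ~4.02 GiB
print(scan_memory_bound(2 * 1024**3, 1024**2, 16) / 1024**3)
```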
\ No newline at end of file diff --git a/java/core/lance-jni/src/blocking_dataset.rs b/java/core/lance-jni/src/blocking_dataset.rs index 470ea7b228..e8c2c94cf3 100644 --- a/java/core/lance-jni/src/blocking_dataset.rs +++ b/java/core/lance-jni/src/blocking_dataset.rs @@ -302,8 +302,8 @@ fn attach_native_dataset<'local>( } fn create_java_dataset_object<'a>(env: &mut JNIEnv<'a>) -> Result<JObject<'a>> { - let objet = env.new_object("com/lancedb/lance/Dataset", "()V", &[])?; - Ok(objet) + let object = env.new_object("com/lancedb/lance/Dataset", "()V", &[])?; + Ok(object) } #[no_mangle] diff --git a/java/core/lance-jni/src/blocking_scanner.rs b/java/core/lance-jni/src/blocking_scanner.rs index e4458f62d4..17247fb82c 100644 --- a/java/core/lance-jni/src/blocking_scanner.rs +++ b/java/core/lance-jni/src/blocking_scanner.rs @@ -121,7 +121,7 @@ fn inner_create_scanner<'local>( let mut scanner = dataset_guard.inner.scan(); - // handle frament_ids + // handle fragment_ids if let Some(fragment_ids) = fragment_ids_opt { let mut fragments = Vec::with_capacity(fragment_ids.len()); for fragment_id in fragment_ids { diff --git a/protos/file2.proto b/protos/file2.proto index a4bbf24583..7bc1f2c9f0 100644 --- a/protos/file2.proto +++ b/protos/file2.proto @@ -49,7 +49,7 @@ import "google/protobuf/empty.proto"; // // If direct I/O is required then most (but not all) fields described // below must be sector aligned. We have marked these fields with an -// asterick for clarity. Readers should assume there will be optional +// asterisk for clarity. Readers should assume there will be optional // padding inserted before these fields. // // All footer fields are unsigned integers written with little endian @@ -96,7 +96,7 @@ import "google/protobuf/empty.proto"; // // ## Data Pages // -// A lot of flexiblity is provided in how data is stored. Note that the file +// A lot of flexibility is provided in how data is stored. Note that the file // layout has no explicit notion of a page (however, it is a part of the column // metadata). A page's buffers do not strictly need to be contiguous on the // disk. However, it is recommended that buffers within a page be grouped diff --git a/python/python/benchmarks/test_index.py b/python/python/benchmarks/test_index.py index 1c96e14fa5..0c6c41295f 100644 --- a/python/python/benchmarks/test_index.py +++ b/python/python/benchmarks/test_index.py @@ -163,7 +163,7 @@ def test_train_ivf(test_large_dataset, benchmark, num_partitions): ) -# Pre-computing partition assigment only makes sense on CUDA and so this benchmark runs +# Pre-computing partition assignment only makes sense on CUDA and so this benchmark runs # only on CUDA.
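For intuition on this benchmark: IVF partition assignment is nearest-centroid search, which reduces to one big matrix multiply — exactly why it pays off on CUDA. A NumPy sketch of the computation (illustrative, not Lance's implementation):

```python
import numpy as np

def assign_partitions(vectors: np.ndarray, centroids: np.ndarray) -> np.ndarray:
    # vectors: (n, d), centroids: (k, d). Expanding ||v - c||^2 lets us use one
    # matmul, the same trick a CUDA implementation relies on.
    v2 = (vectors**2).sum(axis=1, keepdims=True)   # (n, 1)
    c2 = (centroids**2).sum(axis=1)                # (k,)
    d2 = v2 + c2 - 2.0 * vectors @ centroids.T     # (n, k) squared distances
    return d2.argmin(axis=1)                       # partition id per vector
```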
@pytest.mark.benchmark(group="assign_partitions") @pytest.mark.parametrize("num_partitions", [100, 300]) diff --git a/python/python/lance/_arrow/bf16.py b/python/python/lance/_arrow/bf16.py index 4fe156cf6d..9ecd361183 100644 --- a/python/python/lance/_arrow/bf16.py +++ b/python/python/lance/_arrow/bf16.py @@ -105,12 +105,12 @@ def __init__(self): pa.ExtensionType.__init__(self, pa.binary(2), "lance.bfloat16") def __arrow_ext_serialize__(self): - # TODO: encode endianess + # TODO: encode endianness return b"" @classmethod def __arrow_ext_deserialize__(self, storage_type, serialized): - # TODO: decode endianess + # TODO: decode endianness return BFloat16Type() def __arrow_ext_class__(self): diff --git a/python/python/lance/_dataset/sharded_batch_iterator.py b/python/python/lance/_dataset/sharded_batch_iterator.py index 68d14c1e73..0bf26efe8a 100644 --- a/python/python/lance/_dataset/sharded_batch_iterator.py +++ b/python/python/lance/_dataset/sharded_batch_iterator.py @@ -21,7 +21,7 @@ class ShardedBatchIterator: """An iterator of RecordBatches, over the sharded dataset. - Parmeters + Parameters --------- uri: str or Path Dataset base URI diff --git a/python/python/lance/dataset.py b/python/python/lance/dataset.py index 9cafa518e1..7a3e91a711 100644 --- a/python/python/lance/dataset.py +++ b/python/python/lance/dataset.py @@ -938,7 +938,7 @@ def add_columns( The names of the columns that the UDF will read. If None, then the UDF will read all columns. This is only used when transforms is a UDF. Otherwise, the read columns are inferred from the SQL expressions. - reader_scheam: pa.Schema, optional + reader_schema: pa.Schema, optional Only valid if transforms is a `ReaderLike` object. This will be used to determine the schema of the reader. batch_size: int, optional diff --git a/python/python/lance/ray/sink.py b/python/python/lance/ray/sink.py index ea3b92e715..7cde7eada1 100644 --- a/python/python/lance/ray/sink.py +++ b/python/python/lance/ray/sink.py @@ -351,7 +351,7 @@ def __call__(self, batch: Union[pa.Table, "pd.DataFrame"]) -> Dict[str, Any]: class LanceCommitter(_BaseLanceDatasink): - """Lance Commiter as Ray Datasink. + """Lance Committer as Ray Datasink. This is used with `LanceFragmentWriter` to write large-than-memory data to lance file. @@ -362,7 +362,7 @@ def num_rows_per_write(self) -> int: return 1 def get_name(self) -> str: - return f"LanceCommiter({self.mode})" + return f"LanceCommitter({self.mode})" def write( self, diff --git a/python/python/tests/test_optimize.py b/python/python/tests/test_optimize.py index e1668e6d6f..fecfc72d7d 100644 --- a/python/python/tests/test_optimize.py +++ b/python/python/tests/test_optimize.py @@ -130,7 +130,7 @@ def test_compact_with_write(tmp_path: Path): # This test creates a dataset with a manifest containing fragments # that are not in sorted order (by id) # - # We do this by runnign compaction concurrently with append + # We do this by running compaction concurrently with append # # This is because compaction first reserves a fragment id. Then the # concurrent writes grab later ids and commit them. Then the compaction diff --git a/python/python/tests/test_ray.py b/python/python/tests/test_ray.py index 03f9253ae6..b85f185aff 100644 --- a/python/python/tests/test_ray.py +++ b/python/python/tests/test_ray.py @@ -17,7 +17,7 @@ _register_hooks, ) -# Use this hook until we have offical DataSink in Ray. +# Use this hook until we have official DataSink in Ray. 
_register_hooks() ray.init() diff --git a/python/src/dataset.rs b/python/src/dataset.rs index 6d3f1f9168..92d0b0efb1 100644 --- a/python/src/dataset.rs +++ b/python/src/dataset.rs @@ -1702,7 +1702,7 @@ fn prepare_vector_index_params( }; if let Some(f) = kwargs.get_item("precomputed_partitions_file")? { - ivf_params.precomputed_partitons_file = Some(f.to_string()); + ivf_params.precomputed_partitions_file = Some(f.to_string()); }; if let Some(storage_options) = storage_options { diff --git a/rust/lance-core/src/datatypes.rs b/rust/lance-core/src/datatypes.rs index 6acb9ed936..219f263f22 100644 --- a/rust/lance-core/src/datatypes.rs +++ b/rust/lance-core/src/datatypes.rs @@ -262,7 +262,7 @@ impl TryFrom<&LogicalType> for DataType { "dict" => { if splits.len() != 4 { Err(Error::Schema { - message: format!("Unsupport dictionary type: {}", lt), + message: format!("Unsupported dictionary type: {}", lt), location: location!(), }) } else { @@ -274,7 +274,7 @@ "decimal" => { if splits.len() != 4 { Err(Error::Schema { - message: format!("Unsupport decimal type: {}", lt), + message: format!("Unsupported decimal type: {}", lt), location: location!(), }) } else { diff --git a/rust/lance-core/src/utils/testing.rs b/rust/lance-core/src/utils/testing.rs index 87cbb4d33d..9746787f71 100644 --- a/rust/lance-core/src/utils/testing.rs +++ b/rust/lance-core/src/utils/testing.rs @@ -57,7 +57,7 @@ pub struct ProxyObjectStorePolicy { /// be returned instead. before_policies: HashMap, /// Policies which run after calls that return ObjectMeta. The policy can - /// tranform the returned ObjectMeta to mock out file listing results. + /// transform the returned ObjectMeta to mock out file listing results. object_meta_policies: HashMap, } diff --git a/rust/lance-datafusion/src/exec.rs b/rust/lance-datafusion/src/exec.rs index 0ec78dfc29..e7b46d33bc 100644 --- a/rust/lance-datafusion/src/exec.rs +++ b/rust/lance-datafusion/src/exec.rs @@ -35,14 +35,14 @@ use log::{debug, info, warn}; /// A source execution node created from an existing stream /// /// It can only be used once, and will return the stream. After that the node -/// is exhuasted. +/// is exhausted. /// /// Note: the stream should be finite, otherwise we will report datafusion properties /// incorrectly.
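The struct itself follows. The "use once, then exhausted" contract described above boils down to a take-once cell; a Python illustration of that contract (illustrative only, not the Rust implementation):

```python
class OneShot:
    # Holds a stream until first use; afterwards the node reports EXHAUSTED.
    def __init__(self, stream):
        self._stream = stream

    def take(self):
        if self._stream is None:
            raise RuntimeError("OneShot already exhausted")
        stream, self._stream = self._stream, None
        return stream
```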
pub struct OneShotExec { stream: Mutex<Option<SendableRecordBatchStream>>, // We save off a copy of the schema to speed up formatting and so ExecutionPlan::schema & display_as - // can still function after exhuasted + // can still function after exhausted schema: Arc<ArrowSchema>, properties: PlanProperties, } @@ -91,7 +91,7 @@ impl DisplayAs for OneShotExec { let stream = self.stream.lock().unwrap(); match t { DisplayFormatType::Default | DisplayFormatType::Verbose => { - let exhausted = if stream.is_some() { "" } else { "EXHUASTED " }; + let exhausted = if stream.is_some() { "" } else { "EXHAUSTED " }; let columns = self .schema .field_names() diff --git a/rust/lance-datafusion/src/logical_expr.rs b/rust/lance-datafusion/src/logical_expr.rs index 46c1a64feb..ebfb73ea03 100644 --- a/rust/lance-datafusion/src/logical_expr.rs +++ b/rust/lance-datafusion/src/logical_expr.rs @@ -287,7 +287,7 @@ pub mod tests { #[test] fn test_resolve_in_expr() { - // Type coersion should apply for `A IN (0)` or `A NOT IN (0)` + // Type coercion should apply for `A IN (0)` or `A NOT IN (0)` let arrow_schema = ArrowSchema::new(vec![Field::new("a", DataType::Float32, false)]); let expr = Expr::in_list( Expr::Column("a".to_string().into()), diff --git a/rust/lance-datafusion/src/substrait.rs b/rust/lance-datafusion/src/substrait.rs index 7c86479622..57cffb1261 100644 --- a/rust/lance-datafusion/src/substrait.rs +++ b/rust/lance-datafusion/src/substrait.rs @@ -350,7 +350,7 @@ pub async fn parse_substrait(expr: &[u8], input_schema: Arc<Schema>) -> Result<Expr> { diff --git a/rust/lance-datagen/src/generator.rs b/rust/lance-datagen/src/generator.rs index 81286c1182..288c011d20 100644 --- a/rust/lance-datagen/src/generator.rs +++ b/rust/lance-datagen/src/generator.rs @@ -470,9 +470,9 @@ impl ArrayGenerator for CycleVectorGenerator { } #[derive(Default)] -pub struct PseduoUuidGenerator {} -impl ArrayGenerator for PseduoUuidGenerator { +pub struct PseudoUuidGenerator {} +impl ArrayGenerator for PseudoUuidGenerator { fn generate( &mut self, length: RowCount, @@ -497,9 +497,9 @@ } #[derive(Default)] -pub struct PseduoUuidHexGenerator {} -impl ArrayGenerator for PseduoUuidHexGenerator { +pub struct PseudoUuidHexGenerator {} +impl ArrayGenerator for PseudoUuidHexGenerator { fn generate( &mut self, length: RowCount, @@ -1581,8 +1581,8 @@ pub mod array { /// Note, these are "pseudo UUIDs". They are 16-byte randomish values but they /// are not guaranteed to be unique. We use a simplistic RNG that trades uniqueness /// for speed. - pub fn rand_pseduo_uuid() -> Box<dyn ArrayGenerator> { - Box::<PseduoUuidGenerator>::default() + pub fn rand_pseudo_uuid() -> Box<dyn ArrayGenerator> { + Box::<PseudoUuidGenerator>::default() } /// Create a generator of random UUIDs, stored as 32-character strings (hex encoding /// /// Note, these are "pseudo UUIDs". They are 16-byte randomish values but they /// are not guaranteed to be unique. We use a simplistic RNG that trades uniqueness /// for speed.
- pub fn rand_pseduo_uuid_hex() -> Box<dyn ArrayGenerator> { - Box::<PseduoUuidHexGenerator>::default() + pub fn rand_pseudo_uuid_hex() -> Box<dyn ArrayGenerator> { + Box::<PseudoUuidHexGenerator>::default() } pub fn rand_primitive( diff --git a/rust/lance-encoding/src/compression_algo/fsst/src/fsst.rs b/rust/lance-encoding/src/compression_algo/fsst/src/fsst.rs index 2cdbba27e5..664f955694 100644 --- a/rust/lance-encoding/src/compression_algo/fsst/src/fsst.rs +++ b/rust/lance-encoding/src/compression_algo/fsst/src/fsst.rs @@ -387,7 +387,7 @@ impl SymbolTable { // rationale for finalize: // - during symbol table construction, we may create more than 256 codes, but bring it down to max 255 in the last makeTable() - // consequently we needed more than 8 bits during symbol table contruction, but can simplify the codes to single bytes in finalize() + // consequently we needed more than 8 bits during symbol table construction, but can simplify the codes to single bytes in finalize() // (this feature is in fact no longer used, but could still be exploited: symbol construction creates no more than 255 symbols in each pass) // - we not only reduce the amount of codes to <255, but also *reorder* the symbols and renumber their codes, for higher compression perf. // we renumber codes so they are grouped by length, to allow optimized scalar string compression (byteLim and suffixLim optimizations). diff --git a/rust/lance-encoding/src/data.rs b/rust/lance-encoding/src/data.rs index a6baf9d6c4..4291bd48b9 100644 --- a/rust/lance-encoding/src/data.rs +++ b/rust/lance-encoding/src/data.rs @@ -31,7 +31,7 @@ use lance_core::{Error, Result}; use crate::{buffer::LanceBuffer, statistics::Stat}; -/// `Encoding` enum serves as a encoding registeration center. +/// `Encoding` enum serves as an encoding registration center. /// /// All the encodings added to Lance should register here, and /// these encodings can be dynamically selected during encoding, @@ -891,9 +891,9 @@ fn arrow_binary_to_data_block( num_values: u64, bits_per_offset: u8, ) -> DataBlock { - let datas = arrays.iter().map(|arr| arr.to_data()).collect::<Vec<_>>(); + let data_vec = arrays.iter().map(|arr| arr.to_data()).collect::<Vec<_>>(); let bytes_per_offset = bits_per_offset as usize / 8; - let offsets = datas + let offsets = data_vec .iter() .map(|d| { LanceBuffer::Borrowed( @@ -908,7 +908,7 @@ } else { stitch_offsets::<i64>(offsets) }; - let data = datas + let data = data_vec .iter() .zip(data_ranges) .map(|(d, byte_range)| { @@ -1222,11 +1222,11 @@ impl DataBlock { let structs = arrays.iter().map(|arr| arr.as_struct()).collect::<Vec<_>>(); let mut children = Vec::with_capacity(fields.len()); for child_idx in 0..fields.len() { - let childs = structs + let child_vec = structs .iter() .map(|s| s.column(child_idx).clone()) .collect::<Vec<_>>(); - children.push(Self::from_arrays(&childs, num_values)); + children.push(Self::from_arrays(&child_vec, num_values)); } Self::Struct(StructDataBlock { children }) } diff --git a/rust/lance-encoding/src/decoder.rs b/rust/lance-encoding/src/decoder.rs index cefab2cb9b..3fab7791be 100644 --- a/rust/lance-encoding/src/decoder.rs +++ b/rust/lance-encoding/src/decoder.rs @@ -73,7 +73,7 @@ //! - values: Value (physical encoding) //! - items: Primitive (logical encoding) //! - column: Basic (physical encoding) -//! - values: Value (phsyical encoding) +//! - values: Value (physical encoding) //! //! Note that, in this example, root.items.column does not have a validity because there were //! no nulls in the page.
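One note on the `DataBlock::from_arrays` hunk above: for struct arrays it gathers each child column across all input chunks and then encodes the children independently. The pyarrow equivalent of that gather step, as a sketch (not the Lance API):

```python
import pyarrow as pa

def struct_children(chunks: list[pa.StructArray]) -> list[list[pa.Array]]:
    # For each struct field, collect that child column from every chunk,
    # mirroring the child_vec gather in DataBlock::from_arrays.
    num_fields = chunks[0].type.num_fields
    return [[chunk.field(i) for chunk in chunks] for i in range(num_fields)]
```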
@@ -433,7 +433,7 @@ impl<'a> DecoderMiddlewareChainCursor<'a> { &self.io } - /// Delegates responsibilty to the next encoder in the chain + /// Delegates responsibility to the next encoder in the chain /// /// Field schedulers should call this method when: /// @@ -616,7 +616,7 @@ impl<'a> ColumnInfoIter<'a> { pub trait FieldDecoderStrategy: Send + Sync + std::fmt::Debug { /// Called to create a field scheduler for a field /// - /// Stratgies can examine: + /// Strategies can examine: /// * The target field /// * The column metadata (potentially consuming multiple columns) /// @@ -1286,7 +1286,7 @@ pub struct BatchDecodeStream { rows_per_batch: u32, rows_scheduled: u64, rows_drained: u64, - scheduler_exhuasted: bool, + scheduler_exhausted: bool, emitted_batch_size_warning: Arc, } @@ -1297,7 +1297,7 @@ impl BatchDecodeStream { /// /// * `scheduled` - an incoming stream of decode tasks from a /// [`crate::decode::DecodeBatchScheduler`] - /// * `schema` - the scheam of the data to create + /// * `schema` - the schema of the data to create /// * `rows_per_batch` the number of rows to create before making a batch /// * `num_rows` the total number of rows scheduled /// * `num_columns` the total number of columns in the file @@ -1314,7 +1314,7 @@ impl BatchDecodeStream { rows_per_batch, rows_scheduled: 0, rows_drained: 0, - scheduler_exhuasted: false, + scheduler_exhausted: false, emitted_batch_size_warning: Arc::new(Once::new()), } } @@ -1329,7 +1329,7 @@ impl BatchDecodeStream { } async fn wait_for_scheduled(&mut self, scheduled_need: u64) -> Result { - if self.scheduler_exhuasted { + if self.scheduler_exhausted { return Ok(self.rows_scheduled); } while self.rows_scheduled < scheduled_need { @@ -1346,7 +1346,7 @@ impl BatchDecodeStream { // Schedule ended before we got all the data we expected. This probably // means some kind of pushdown filter was applied and we didn't load as // much data as we thought we would. - self.scheduler_exhuasted = true; + self.scheduler_exhausted = true; return Ok(self.rows_scheduled); } } @@ -1889,8 +1889,8 @@ impl FilterExpression { /// cover many columns of child data. In fact, the entire file is treated as one /// top-level struct field. /// -/// The scheduler is responsible for calculating the neccesary I/O. One schedule_range -/// request could trigger mulitple batches of I/O across multiple columns. The scheduler +/// The scheduler is responsible for calculating the necessary I/O. One schedule_range +/// request could trigger multiple batches of I/O across multiple columns. The scheduler /// should emit decoders into the sink as quickly as possible. /// /// As soon as the scheduler encounters a batch of data that can decoded then the scheduler diff --git a/rust/lance-encoding/src/encoder.rs b/rust/lance-encoding/src/encoder.rs index 4351c47a98..4d72407d02 100644 --- a/rust/lance-encoding/src/encoder.rs +++ b/rust/lance-encoding/src/encoder.rs @@ -286,7 +286,7 @@ impl Default for EncodedColumn { /// As a result, most encoders should not need to use this structure. /// /// In some cases (currently only the large binary encoding) there is a need to access -/// buffers that are not in the page (becuase storing the position / offset of every page +/// buffers that are not in the page (because storing the position / offset of every page /// in the page metadata would be too expensive). 
/// /// To do this you can add a buffer with `add_buffer` and then use the returned position diff --git a/rust/lance-encoding/src/encodings/logical/blob.rs b/rust/lance-encoding/src/encodings/logical/blob.rs index 36b8b6881d..6d6b7f982a 100644 --- a/rust/lance-encoding/src/encodings/logical/blob.rs +++ b/rust/lance-encoding/src/encodings/logical/blob.rs @@ -29,7 +29,7 @@ use crate::{ /// A field scheduler for large binary data /// -/// Large binary data (1MiB+) can be inefficient if we store as a regular primtive. We +/// Large binary data (1MiB+) can be inefficient if we store as a regular primitive. We /// essentially end up with 1 page per row (or a few rows) and the overhead of the /// metadata can be significant. /// diff --git a/rust/lance-encoding/src/encodings/logical/list.rs b/rust/lance-encoding/src/encodings/logical/list.rs index 54f94693a6..4abbcc480f 100644 --- a/rust/lance-encoding/src/encodings/logical/list.rs +++ b/rust/lance-encoding/src/encodings/logical/list.rs @@ -574,7 +574,7 @@ impl FieldScheduler for ListFieldScheduler { /// complete. /// /// Once the indirect I/O is finished we pull items out of `unawaited`, wait them -/// (this wait should return immedately) and then push them into `item_decoders`. +/// (this wait should return immediately) and then push them into `item_decoders`. /// /// We then drain from `item_decoders`, popping item pages off as we finish with /// them. @@ -1204,7 +1204,7 @@ impl FieldEncoder for ListFieldEncoder { // a limitation in the current scheduler and could be addressed in the future. As a result // we always need to encode the items page if we encode the offsets page. // - // In practice this isn't usually too bad unless we are targetting very small pages. + // In practice this isn't usually too bad unless we are targeting very small pages. item_tasks = self.items_encoder.flush(external_buffers)?; } Self::combine_tasks(offsets_tasks, item_tasks) diff --git a/rust/lance-encoding/src/encodings/logical/primitive.rs b/rust/lance-encoding/src/encodings/logical/primitive.rs index c59067124b..d23e1dc6b8 100644 --- a/rust/lance-encoding/src/encodings/logical/primitive.rs +++ b/rust/lance-encoding/src/encodings/logical/primitive.rs @@ -831,7 +831,7 @@ impl PrimitiveStructuralEncoder { // TODO: Parquet sparsely encodes values here. We could do the same but // then we won't have log2 values per chunk. This means more metadata - // and potentially more decoder assymetry. However, it may be worth + // and potentially more decoder asymmetry. However, it may be worth // investigating at some point let data = DataBlock::from_arrays(&arrays, num_values); diff --git a/rust/lance-encoding/src/encodings/physical/basic.rs b/rust/lance-encoding/src/encodings/physical/basic.rs index df69fc8536..6acd88f272 100644 --- a/rust/lance-encoding/src/encodings/physical/basic.rs +++ b/rust/lance-encoding/src/encodings/physical/basic.rs @@ -98,7 +98,7 @@ impl BasicPageScheduler { /// /// It may seem strange we need `values_decoder` here but Arrow requires that value /// buffers still be allocated / sized even if everything is null. So we need the value - /// decoder to calculate the capcity of the garbage buffer. + /// decoder to calculate the capacity of the garbage buffer. 
pub fn new_all_null() -> Self { Self { mode: SchedulerNullStatus::All, diff --git a/rust/lance-encoding/src/encodings/physical/bitmap.rs b/rust/lance-encoding/src/encodings/physical/bitmap.rs index 2f30a95beb..f36465c218 100644 --- a/rust/lance-encoding/src/encodings/physical/bitmap.rs +++ b/rust/lance-encoding/src/encodings/physical/bitmap.rs @@ -18,7 +18,7 @@ use crate::{ }; /// A physical scheduler for bitmap buffers encoded densely as 1 bit per value -/// with bit-endianess (e.g. what Arrow uses for validity bitmaps and boolean arrays) +/// with bit-endianness (e.g. what Arrow uses for validity bitmaps and boolean arrays) /// /// This decoder decodes from one buffer of disk data into one buffer of memory data #[derive(Debug, Clone, Copy)] diff --git a/rust/lance-encoding/src/encodings/physical/packed_struct.rs b/rust/lance-encoding/src/encodings/physical/packed_struct.rs index 78927caa4a..9647aa8541 100644 --- a/rust/lance-encoding/src/encodings/physical/packed_struct.rs +++ b/rust/lance-encoding/src/encodings/physical/packed_struct.rs @@ -192,7 +192,7 @@ impl ArrayEncoder for PackedStructEncoder { encoded_fields.push(encoder.encode(child, child_type.data_type(), &mut 0)?); } - let (encoded_datas, child_encodings): (Vec<_>, Vec<_>) = encoded_fields + let (encoded_data_vec, child_encodings): (Vec<_>, Vec<_>) = encoded_fields .into_iter() .map(|field| (field.data, field.encoding)) .unzip(); @@ -202,7 +202,7 @@ // We can currently encode both FixedWidth and FixedSizeList. In order // to encode the latter we "flatten" it converting a FixedSizeList into // a FixedWidth with very wide items. - let fixed_fields = encoded_datas + let fixed_fields = encoded_data_vec .into_iter() .map(|child| match child { DataBlock::FixedWidth(fixed) => Ok(fixed), diff --git a/rust/lance-encoding/src/repdef.rs b/rust/lance-encoding/src/repdef.rs index 6c81a1d081..33e054d656 100644 --- a/rust/lance-encoding/src/repdef.rs +++ b/rust/lance-encoding/src/repdef.rs @@ -9,7 +9,7 @@ //! is irrelevant. //! //! Note: the concept of repetition & definition levels comes from the Dremel paper and has -//! been implemented in Apache Parquet. However, the implementation here is not neccesarily +//! been implemented in Apache Parquet. However, the implementation here is not necessarily //! compatible with Parquet. For example, we use 0 to represent the "inner-most" item and //! Parquet uses 0 to represent the "outer-most" item. //! @@ -434,7 +434,7 @@ impl RepDefBuilder { /// Starts with serialized repetition and definition levels and unravels /// them into validity buffers and offsets buffers /// -/// This is used during decoding to create the neccesary arrow structures +/// This is used during decoding to create the necessary arrow structures #[derive(Debug)] pub struct RepDefUnraveler { rep_levels: Option<LevelBuffer>, diff --git a/rust/lance-file/src/v2/reader.rs b/rust/lance-file/src/v2/reader.rs index 59a4e092ae..88dad66847 100644 --- a/rust/lance-file/src/v2/reader.rs +++ b/rust/lance-file/src/v2/reader.rs @@ -898,7 +898,7 @@ impl FileReader { /// 2^31 bytes of string data (which is the maximum size of a string column /// in Arrow). In this case smaller batches may be emitted. /// * `batch_readahead` - The number of batches to read ahead. This controls the - /// amount of CPU parallelism of the read. In other words it controlls how many + /// amount of CPU parallelism of the read. In other words it controls how many /// batches will be decoded in parallel.
It has no effect on the I/O parallelism /// of the read (how many I/O requests are in flight at once). /// diff --git a/rust/lance-file/src/writer.rs b/rust/lance-file/src/writer.rs index 84b807145e..e7b0ff59dc 100644 --- a/rust/lance-file/src/writer.rs +++ b/rust/lance-file/src/writer.rs @@ -207,7 +207,7 @@ impl FileWriter { Ok(()) } - /// Add schema metedata, as (key, value) pair to the file. + /// Add schema metadata as a (key, value) pair to the file. pub fn add_metadata(&mut self, key: &str, value: &str) { self.schema .metadata diff --git a/rust/lance-file/src/writer/statistics.rs b/rust/lance-file/src/writer/statistics.rs index 39425a407a..0ffd3d406d 100644 --- a/rust/lance-file/src/writer/statistics.rs +++ b/rust/lance-file/src/writer/statistics.rs @@ -1424,7 +1424,7 @@ mod tests { ), }, }, - // Sting is not incremented if it's exact lenght of the limit + // String is not incremented if it's the exact length of the limit TestCase { source_arrays: vec![Arc::new(StringArray::from(vec![format!( "{}{}", @@ -1472,7 +1472,7 @@ ))), }, }, - // Sting is not incremented if it's exact lenght of the limit + // String is not incremented if it's the exact length of the limit TestCase { source_arrays: vec![Arc::new(LargeStringArray::from(vec![format!( "{}{}", diff --git a/rust/lance-index/src/scalar.rs b/rust/lance-index/src/scalar.rs index 79e94b4f19..27a86ea9f4 100644 --- a/rust/lance-index/src/scalar.rs +++ b/rust/lance-index/src/scalar.rs @@ -482,7 +482,7 @@ impl AnyQuery for LabelListQuery { pub trait ScalarIndex: Send + Sync + std::fmt::Debug + Index + DeepSizeOf { /// Search the scalar index /// - /// Returns all row ids that satisfy the query, these row ids are not neccesarily ordered + /// Returns all row ids that satisfy the query; these row ids are not necessarily ordered async fn search(&self, query: &dyn AnyQuery) -> Result; /// Load the scalar index from storage diff --git a/rust/lance-index/src/scalar/expression.rs b/rust/lance-index/src/scalar/expression.rs index 12ea97620d..f1423846aa 100644 --- a/rust/lance-index/src/scalar/expression.rs +++ b/rust/lance-index/src/scalar/expression.rs @@ -347,7 +347,7 @@ pub trait ScalarIndexLoader: Send + Sync { /// This represents a lookup into one or more scalar indices /// -/// This is a tree of operations beacause we may need to logically combine or +/// This is a tree of operations because we may need to logically combine or /// modify the results of scalar lookups #[derive(Debug, Clone)] pub enum ScalarIndexExpr { diff --git a/rust/lance-index/src/vector/flat/storage.rs b/rust/lance-index/src/vector/flat/storage.rs index 7173697233..8dd694385f 100644 --- a/rust/lance-index/src/vector/flat/storage.rs +++ b/rust/lance-index/src/vector/flat/storage.rs @@ -185,7 +185,7 @@ pub struct FlatDistanceCal<'a> { impl<'a> FlatDistanceCal<'a> { fn new(vectors: &'a FixedSizeListArray, query: ArrayRef, distance_type: DistanceType) -> Self { - // Gained sigificant performance improvement by using strong typed primtive slice. + // Gained significant performance improvement by using strongly typed primitive slice. // TODO: to support other data types other than `f32`, make FlatDistanceCal a generic struct.
let flat_array = vectors.values().as_primitive::<Float32Type>(); let dimension = vectors.value_length() as usize; diff --git a/rust/lance-index/src/vector/ivf/builder.rs b/rust/lance-index/src/vector/ivf/builder.rs index 79d48eec17..475435460e 100644 --- a/rust/lance-index/src/vector/ivf/builder.rs +++ b/rust/lance-index/src/vector/ivf/builder.rs @@ -32,10 +32,10 @@ pub struct IvfBuildParams { /// Precomputed partitions file (row_id -> partition_id) /// mutually exclusive with `precomputed_shuffle_buffers` - pub precomputed_partitons_file: Option<String>, + pub precomputed_partitions_file: Option<String>, /// Precomputed shuffle buffers (row_id -> partition_id, pq_code) - /// mutually exclusive with `precomputed_partitons_file` + /// mutually exclusive with `precomputed_partitions_file` /// requires `centroids` to be set /// /// The input is expected to be (/dir/to/buffers, [buffer1.lance, buffer2.lance, ...]) @@ -59,7 +59,7 @@ impl Default for IvfBuildParams { max_iters: 50, centroids: None, sample_rate: 256, // See faiss - precomputed_partitons_file: None, + precomputed_partitions_file: None, precomputed_shuffle_buffers: None, shuffle_partition_batches: 1024 * 10, shuffle_partition_concurrency: 2, diff --git a/rust/lance-index/src/vector/ivf/shuffler.rs b/rust/lance-index/src/vector/ivf/shuffler.rs index f5998e4c3c..39258df3b4 100644 --- a/rust/lance-index/src/vector/ivf/shuffler.rs +++ b/rust/lance-index/src/vector/ivf/shuffler.rs @@ -723,7 +723,7 @@ impl IvfShuffler { continue; } - // the currnet part doesn't overlap with the current batch + // the current part doesn't overlap with the current batch if start >= cur_end { continue; } diff --git a/rust/lance-index/src/vector/kmeans.rs b/rust/lance-index/src/vector/kmeans.rs index 68af4e36ee..c971a6942b 100644 --- a/rust/lance-index/src/vector/kmeans.rs +++ b/rust/lance-index/src/vector/kmeans.rs @@ -36,7 +36,7 @@ where ),location: location!()}); } - // Ony sample sample_rate * num_clusters. See Faiss + // Only sample sample_rate * num_clusters.
See Faiss let data = if num_rows > sample_rate * k { info!( "Sample {} out of {} to train kmeans of {} dim, {} clusters", diff --git a/rust/lance-index/src/vector/pq.rs b/rust/lance-index/src/vector/pq.rs index a2132caf0e..f6195ec0fb 100644 --- a/rust/lance-index/src/vector/pq.rs +++ b/rust/lance-index/src/vector/pq.rs @@ -18,7 +18,7 @@ use lance_linalg::kmeans::compute_partition; use num_traits::Float; use prost::Message; use snafu::{location, Location}; -use storage::{ProductQuantizationMetadata, ProductQuantizationStorage, PQ_METADTA_KEY}; +use storage::{ProductQuantizationMetadata, ProductQuantizationStorage, PQ_METADATA_KEY}; use tracing::instrument; pub mod builder; @@ -374,7 +374,7 @@ impl Quantization for ProductQuantizer { } fn metadata_key() -> &'static str { - PQ_METADTA_KEY + PQ_METADATA_KEY } fn quantization_type() -> QuantizationType { diff --git a/rust/lance-index/src/vector/pq/storage.rs b/rust/lance-index/src/vector/pq/storage.rs index fbe699250f..bab860fa48 100644 --- a/rust/lance-index/src/vector/pq/storage.rs +++ b/rust/lance-index/src/vector/pq/storage.rs @@ -48,7 +48,7 @@ use crate::{ IndexMetadata, INDEX_METADATA_SCHEMA_KEY, }; -pub const PQ_METADTA_KEY: &str = "lance:pq"; +pub const PQ_METADATA_KEY: &str = "lance:pq"; #[derive(Debug, Clone, Serialize, Deserialize)] pub struct ProductQuantizationMetadata { @@ -79,11 +79,11 @@ impl QuantizerMetadata for ProductQuantizationMetadata { let metadata = reader .schema() .metadata - .get(PQ_METADTA_KEY) + .get(PQ_METADATA_KEY) .ok_or(Error::Index { message: format!( "Reading PQ storage: metadata key {} not found", - PQ_METADTA_KEY + PQ_METADATA_KEY ), location: location!(), })?; @@ -101,9 +101,9 @@ impl QuantizerMetadata for ProductQuantizationMetadata { /// Product Quantization Storage /// -/// It stores PQ code, as well as the row ID to the orignal vectors. +/// It stores PQ code, as well as the row ID to the original vectors. /// -/// It is possible to store additonal metadata to accelerate filtering later. +/// It is possible to store additional metadata to accelerate filtering later. /// /// TODO: support f16/f64 later. #[derive(Clone, Debug)] @@ -336,7 +336,7 @@ impl ProductQuantizationStorage { let mut schema_metadata = HashMap::new(); schema_metadata.insert( - PQ_METADTA_KEY.to_string(), + PQ_METADATA_KEY.to_string(), serde_json::to_string(&metadata)?, ); schema_metadata.insert( diff --git a/rust/lance-index/src/vector/residual.rs b/rust/lance-index/src/vector/residual.rs index 009415e00e..b094e43d11 100644 --- a/rust/lance-index/src/vector/residual.rs +++ b/rust/lance-index/src/vector/residual.rs @@ -27,7 +27,7 @@ pub const RESIDUAL_COLUMN: &str = "__residual_vector"; /// #[derive(Clone)] pub struct ResidualTransform { - /// Flattend centroids. + /// Flattened centroids. centroids: FixedSizeListArray, /// Partition Column diff --git a/rust/lance-index/src/vector/sq/storage.rs b/rust/lance-index/src/vector/sq/storage.rs index beb83f5851..db14badc53 100644 --- a/rust/lance-index/src/vector/sq/storage.rs +++ b/rust/lance-index/src/vector/sq/storage.rs @@ -70,7 +70,7 @@ impl QuantizerMetadata for ScalarQuantizationMetadata { } } -/// An immutable chunk of SclarQuantizationStorage. +/// An immutable chunk of ScalarQuantizationStorage. #[derive(Debug, Clone)] struct SQStorageChunk { batch: RecordBatch, @@ -354,7 +354,7 @@ impl VectorStore for ScalarQuantizationStorage { /// Create a [DistCalculator] to compute the distance between the query. 
/// - /// Using dist calcualtor can be more efficient as it can pre-compute some + /// Using dist calculator can be more efficient as it can pre-compute some /// values. fn dist_calculator(&self, query: ArrayRef) -> Self::DistanceCalculator<'_> { SQDistCalculator::new(query, self, self.quantizer.bounds.clone()) diff --git a/rust/lance-index/src/vector/storage.rs b/rust/lance-index/src/vector/storage.rs index 2f2bb57328..92e12a752b 100644 --- a/rust/lance-index/src/vector/storage.rs +++ b/rust/lance-index/src/vector/storage.rs @@ -90,7 +90,7 @@ pub trait VectorStore: Send + Sync + Sized + Clone { /// Create a [DistCalculator] to compute the distance between the query. /// - /// Using dist calcualtor can be more efficient as it can pre-compute some + /// Using dist calculator can be more efficient as it can pre-compute some /// values. fn dist_calculator(&self, query: ArrayRef) -> Self::DistanceCalculator<'_>; diff --git a/rust/lance-index/src/vector/utils.rs b/rust/lance-index/src/vector/utils.rs index 7771eda550..1f5b3adca4 100644 --- a/rust/lance-index/src/vector/utils.rs +++ b/rust/lance-index/src/vector/utils.rs @@ -38,7 +38,7 @@ pub(crate) fn prefetch_arrow_array(array: &dyn Array) -> Result<()> { } _ => { return Err(Error::io( - format!("unsupport prefetch on {} type", array.data_type()), + format!("unsupported prefetch on {} type", array.data_type()), location!(), )); } diff --git a/rust/lance-index/src/vector/v3/shuffler.rs b/rust/lance-index/src/vector/v3/shuffler.rs index b35d2d4a3a..63233efd53 100644 --- a/rust/lance-index/src/vector/v3/shuffler.rs +++ b/rust/lance-index/src/vector/v3/shuffler.rs @@ -32,7 +32,7 @@ use crate::vector::PART_ID_COLUMN; pub trait ShuffleReader: Send + Sync { /// Read a partition by partition_id /// will return Ok(None) if partition_size is 0 - /// check reader.partiton_size(partition_id) before calling this function + /// check reader.partition_size(partition_id) before calling this function async fn read_partition( &self, partition_id: usize, diff --git a/rust/lance-io/src/encodings/binary.rs b/rust/lance-io/src/encodings/binary.rs index fe202c31ef..f8187a3717 100644 --- a/rust/lance-io/src/encodings/binary.rs +++ b/rust/lance-io/src/encodings/binary.rs @@ -556,7 +556,7 @@ mod tests { let path = temp_dir.path().join("foo"); let mut object_writer = tokio::fs::File::create(&path).await.unwrap(); - // Write some gabage to reset "tell()". + // Write some garbage to reset "tell()". object_writer.write_all(b"1234").await.unwrap(); let mut encoder = BinaryEncoder::new(&mut object_writer); let pos = encoder.encode(&[&data]).await.unwrap(); diff --git a/rust/lance-io/src/encodings/plain.rs b/rust/lance-io/src/encodings/plain.rs index 3cdb664e37..4f77fde5c7 100644 --- a/rust/lance-io/src/encodings/plain.rs +++ b/rust/lance-io/src/encodings/plain.rs @@ -767,7 +767,7 @@ mod tests { ); } - // Re-eanble the following tests once the Lance FileReader / FileWrite is migrated. + // Re-enable the following tests once the Lance FileReader / FileWrite is migrated. 
// #[tokio::test] // async fn test_boolean_slice() { diff --git a/rust/lance-io/src/object_store.rs b/rust/lance-io/src/object_store.rs index 41730cacc8..280c00dd69 100644 --- a/rust/lance-io/src/object_store.rs +++ b/rust/lance-io/src/object_store.rs @@ -146,7 +146,7 @@ const AWS_CREDS_CACHE_KEY: &str = "aws_credentials"; pub struct AwsCredentialAdapter { pub inner: Arc, - // RefCell can't be shared accross threads, so we use HashMap + // RefCell can't be shared across threads, so we use HashMap cache: Arc>>>, // The amount of time before expiry to refresh credentials diff --git a/rust/lance-io/src/scheduler.rs b/rust/lance-io/src/scheduler.rs index d164e4b020..b6cfff300a 100644 --- a/rust/lance-io/src/scheduler.rs +++ b/rust/lance-io/src/scheduler.rs @@ -39,7 +39,7 @@ const BACKPRESSURE_DEBOUNCE: u64 = 60; // // The process-wide limit exists when users need a hard limit on the number of parallel // IOPS, e.g. due to port availability limits or to prevent multiple scans from saturating -// the network. (Note: a process-wide limit of X will not neccesarily limit the number of +// the network. (Note: a process-wide limit of X will not necessarily limit the number of // open TCP connections to exactly X. The underlying object store may open more connections // anyways) // @@ -292,7 +292,7 @@ impl IoQueue { // Next, try and grab a reservation from the queue let mut state = self.state.lock().unwrap(); if let Some(task) = state.next_task() { - // Reservation sucessfully acquired, we will release the global + // Reservation successfully acquired, we will release the global // global reservation after task has run. iop_res.forget(); return Some(task); @@ -746,7 +746,7 @@ impl FileScheduler { /// Submit a single IOP to the reader /// - /// If you have multpile IOPS to perform then [`Self::submit_request`] is going + /// If you have multiple IOPS to perform then [`Self::submit_request`] is going /// to be more efficient. /// /// See [`Self::submit_request`] for more information on the priority and backpressure. diff --git a/rust/lance-linalg/src/clustering.rs b/rust/lance-linalg/src/clustering.rs index caa2a539f9..0bbfea3953 100644 --- a/rust/lance-linalg/src/clustering.rs +++ b/rust/lance-linalg/src/clustering.rs @@ -12,7 +12,7 @@ use crate::Result; /// Clustering Trait. pub trait Clustering { /// The dimension of the vectors. - fn deminsion(&self) -> u32; + fn dimension(&self) -> u32; /// The number of clusters. fn num_clusters(&self) -> u32; diff --git a/rust/lance-linalg/src/distance/cosine.rs b/rust/lance-linalg/src/distance/cosine.rs index 00d75a5fd5..9d1ce0a756 100644 --- a/rust/lance-linalg/src/distance/cosine.rs +++ b/rust/lance-linalg/src/distance/cosine.rs @@ -125,7 +125,7 @@ impl Cosine for f16 { mod f32 { use super::*; - // TODO: how can we explicity infer N? + // TODO: how can we explicitly infer N? 
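The SIMD kernel follows below; for reference, the quantity it accelerates is plain cosine distance. A NumPy sketch of the scalar math (illustrative only):

```python
import numpy as np

def cosine_distance(x: np.ndarray, y: np.ndarray) -> float:
    # Cosine distance as used for vector search: 1 - cosine similarity.
    return 1.0 - np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y))
```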
#[inline] pub(super) fn cosine_once, const N: usize>( x: &[f32], diff --git a/rust/lance-linalg/src/kernels.rs b/rust/lance-linalg/src/kernels.rs index face3589bf..08ce5208bb 100644 --- a/rust/lance-linalg/src/kernels.rs +++ b/rust/lance-linalg/src/kernels.rs @@ -283,8 +283,8 @@ mod tests { assert_eq!(argmin(u.values().iter().copied()), Some(2)); let empty_vec: Vec = vec![]; - let emtpy = Int16Array::from(empty_vec); - assert_eq!(argmin_opt(emtpy.iter()), None) + let empty = Int16Array::from(empty_vec); + assert_eq!(argmin_opt(empty.iter()), None) } #[test] diff --git a/rust/lance-table/src/io/commit/dynamodb.rs b/rust/lance-table/src/io/commit/dynamodb.rs index cc608d51e0..4c1e87bd3e 100644 --- a/rust/lance-table/src/io/commit/dynamodb.rs +++ b/rust/lance-table/src/io/commit/dynamodb.rs @@ -91,18 +91,18 @@ where /// PK: base_uri -- string /// SK: version -- number /// path -- string -/// commiter -- string +/// committer -- string /// /// Consistency: This store is expected to have read-after-write consistency /// consistent_read should always be set to true /// -/// Transaction Safty: This store uses DynamoDB conditional write to ensure +/// Transaction Safety: This store uses DynamoDB conditional write to ensure /// only one writer can win per version. #[derive(Debug)] pub struct DynamoDBExternalManifestStore { client: Arc, table_name: String, - commiter_name: String, + committer_name: String, } // these are in macro because I want to use them in a match statement @@ -121,9 +121,9 @@ macro_rules! path { "path" }; } -macro_rules! commiter { +macro_rules! committer { () => { - "commiter" + "committer" }; } @@ -131,7 +131,7 @@ impl DynamoDBExternalManifestStore { pub async fn new_external_store( client: Arc, table_name: &str, - commiter_name: &str, + committer_name: &str, ) -> Result> { lazy_static::lazy_static! 
{ static ref SANITY_CHECK_CACHE: RwLock> = RwLock::new(HashSet::new()); @@ -140,7 +140,7 @@ impl DynamoDBExternalManifestStore { let store = Arc::new(Self { client: client.clone(), table_name: table_name.to_string(), - commiter_name: commiter_name.to_string(), + committer_name: committer_name.to_string(), }); // already checked this table before, skip @@ -170,7 +170,7 @@ impl DynamoDBExternalManifestStore { ) })?; - let mut has_hask_key = false; + let mut has_hash_key = false; let mut has_range_key = false; // there should be two keys, HASH(base_uri) and RANGE(version) @@ -183,7 +183,7 @@ impl DynamoDBExternalManifestStore { })?; match (key.key_type, key.attribute_name.as_str()) { (KeyType::Hash, base_uri!()) => { - has_hask_key = true; + has_hash_key = true; } (KeyType::Range, version!()) => { has_range_key = true; @@ -201,7 +201,7 @@ impl DynamoDBExternalManifestStore { } // Both keys must be present - if !(has_hask_key && has_range_key) { + if !(has_hash_key && has_range_key) { return Err( Error::io( format!("dynamodb table: {} must have HASH and RANGE keys, named `{}` and `{}` respectively", table_name, base_uri!(), version!()), @@ -292,7 +292,7 @@ impl ExternalManifestStore for DynamoDBExternalManifestStore { if items.len() > 1 { return Err(Error::io( format!( - "dynamodb table: {} return unexpect number of items", + "dynamodb table: {} return unexpected number of items", self.table_name ), location!(), @@ -300,7 +300,7 @@ impl ExternalManifestStore for DynamoDBExternalManifestStore { } let item = items.pop().expect("length checked"); - let version_attibute = item + let version_attribute = item .get(version!()) .ok_or_else(|| Error::io( @@ -318,7 +318,7 @@ impl ExternalManifestStore for DynamoDBExternalManifestStore { ) )?; - match (version_attibute, path_attribute) { + match (version_attribute, path_attribute) { (AttributeValue::N(version), AttributeValue::S(path)) => Ok(Some(( version.parse().map_err(|e| Error::io( format!("dynamodb error: could not parse the version number returned {}, error: {}", version, e), @@ -342,7 +342,7 @@ impl ExternalManifestStore for DynamoDBExternalManifestStore { .item(base_uri!(), AttributeValue::S(base_uri.into())) .item(version!(), AttributeValue::N(version.to_string())) .item(path!(), AttributeValue::S(path.to_string())) - .item(commiter!(), AttributeValue::S(self.commiter_name.clone())) + .item(committer!(), AttributeValue::S(self.committer_name.clone())) .condition_expression(format!( "attribute_not_exists({}) AND attribute_not_exists({})", base_uri!(), @@ -361,7 +361,7 @@ impl ExternalManifestStore for DynamoDBExternalManifestStore { .item(base_uri!(), AttributeValue::S(base_uri.into())) .item(version!(), AttributeValue::N(version.to_string())) .item(path!(), AttributeValue::S(path.to_string())) - .item(commiter!(), AttributeValue::S(self.commiter_name.clone())) + .item(committer!(), AttributeValue::S(self.committer_name.clone())) .condition_expression(format!( "attribute_exists({}) AND attribute_exists({})", base_uri!(), diff --git a/rust/lance-table/src/io/commit/external_manifest.rs b/rust/lance-table/src/io/commit/external_manifest.rs index 0ddd9d4396..75cd0003fd 100644 --- a/rust/lance-table/src/io/commit/external_manifest.rs +++ b/rust/lance-table/src/io/commit/external_manifest.rs @@ -31,7 +31,7 @@ use crate::io::commit::{CommitError, CommitHandler, ManifestWriter}; /// expected to work in tandem with the object store. We are only leveraging /// the external store for concurrent commit. 
Any manifest committed thru this
 /// trait should ultimately be materialized in the object store.
-/// For a visual explaination of the commit loop see
+/// For a visual explanation of the commit loop see
 /// https://github.com/lancedb/lance/assets/12615154/b0822312-0826-432a-b554-3965f8d48d04
 #[async_trait]
 pub trait ExternalManifestStore: std::fmt::Debug + Send + Sync {
diff --git a/rust/lance-table/src/io/manifest.rs b/rust/lance-table/src/io/manifest.rs
index 12196c1573..43e372713c 100644
--- a/rust/lance-table/src/io/manifest.rs
+++ b/rust/lance-table/src/io/manifest.rs
@@ -53,7 +53,7 @@ pub async fn read_manifest(object_store: &ObjectStore, path: &Path) -> Result
 Tokens {
 /// ```
 ///
 /// By default this wrapper will do nothing. To then get tracing output, set the
-/// LANCE_TRACING enviornment variable to your desired level (e.g. "debug").
+/// LANCE_TRACING environment variable to your desired level (e.g. "debug").
 ///
 /// Example:
 ///
diff --git a/rust/lance/src/dataset/fragment.rs b/rust/lance/src/dataset/fragment.rs
index 284a741ecc..5bb5fe48d7 100644
--- a/rust/lance/src/dataset/fragment.rs
+++ b/rust/lance/src/dataset/fragment.rs
@@ -446,7 +446,7 @@ impl FileFragment {
         if file_version != dataset.manifest.data_storage_format.lance_file_version()? {
             return Err(Error::io(
                 format!(
-                    "File version mismatch. Dataset verison: {:?} Fragment version: {:?}",
+                    "File version mismatch. Dataset version: {:?} Fragment version: {:?}",
                     dataset.manifest.data_storage_format.lance_file_version()?,
                     file_version
                 ),
@@ -1709,7 +1709,7 @@ impl FragmentReader {
         if !params.valid_given_len(total_num_rows as usize) {
             return Err(Error::invalid_input(
                 format!(
-                    "Invalid read params {} for fragment with {} addressible rows",
+                    "Invalid read params {} for fragment with {} addressable rows",
                     params, total_num_rows
                 ),
                 location!(),
@@ -2083,11 +2083,11 @@ mod tests {
         let mut dataset = create_dataset(test_uri, LanceFileVersion::Legacy).await;
         // Delete last 20 rows in first fragment
         dataset.delete("i >= 20").await.unwrap();
-        // Last fragment has 20 rows but 40 addressible rows
+        // Last fragment has 20 rows but 40 addressable rows
         let fragment = &dataset.get_fragments()[0];
         assert_eq!(fragment.metadata.num_rows().unwrap(), 20);
-        // Test with take_range (all rows addressible)
+        // Test with take_range (all rows addressable)
         for with_row_id in [false, true] {
             let reader = fragment
                 .open(
@@ -2111,7 +2111,7 @@ mod tests {
             }
         }
-        // Test with read_range (only non-deleted rows addressible)
+        // Test with read_range (only non-deleted rows addressable)
         for with_row_id in [false, true] {
             let reader = fragment
                 .open(
diff --git a/rust/lance/src/dataset/optimize.rs b/rust/lance/src/dataset/optimize.rs
index cfe4bf67ba..fef45e5f1c 100644
--- a/rust/lance/src/dataset/optimize.rs
+++ b/rust/lance/src/dataset/optimize.rs
@@ -640,7 +640,7 @@ async fn rewrite_files(
     let previous_writer_version = &dataset.manifest.writer_version;
     // The versions of Lance prior to when we started writing the writer version
-    // sometimes wrote incorrect `Fragment.phyiscal_rows` values, so we should
+    // sometimes wrote incorrect `Fragment.physical_rows` values, so we should
     // make sure to recompute them.
     // See: https://github.com/lancedb/lance/issues/1531
     let recompute_stats = previous_writer_version.is_none();
diff --git a/rust/lance/src/dataset/scanner.rs b/rust/lance/src/dataset/scanner.rs
index ce49568087..bc7537938a 100644
--- a/rust/lance/src/dataset/scanner.rs
+++ b/rust/lance/src/dataset/scanner.rs
@@ -4068,7 +4068,7 @@ mod test {
         for use_projection in [false, true] {
             for use_deleted_data in [false, true] {
                 for use_new_data in [false, true] {
-                    // Don't test compaction in conjuction with deletion and new data, it's too
+                    // Don't test compaction in conjunction with deletion and new data, it's too
                     // many combinations with no clear benefit. Feel free to update if there is
                     // a need
                     // TODO: enable compaction for stable row id once supported.
diff --git a/rust/lance/src/dataset/transaction.rs b/rust/lance/src/dataset/transaction.rs
index cf5a2e991b..e4dabf78c2 100644
--- a/rust/lance/src/dataset/transaction.rs
+++ b/rust/lance/src/dataset/transaction.rs
@@ -683,7 +683,7 @@ impl Transaction {
             && matches!(self.operation, Operation::Overwrite { .. })
         {
             // If this is an overwrite operation and the user has requested a specific version
-            // then ovewrite with that version. Otherwise, if the user didn't request a specific
+            // then overwrite with that version. Otherwise, if the user didn't request a specific
             // version, then overwrite with whatever version we had before.
             prev_manifest.data_storage_format =
                 DataStorageFormat::new(user_requested_version.unwrap());
diff --git a/rust/lance/src/dataset/write/merge_insert.rs b/rust/lance/src/dataset/write/merge_insert.rs
index ba8e8bb9e5..25e915be50 100644
--- a/rust/lance/src/dataset/write/merge_insert.rs
+++ b/rust/lance/src/dataset/write/merge_insert.rs
@@ -1601,7 +1601,7 @@ mod tests {
         let data = lance_datagen::gen()
             .with_seed(Seed::from(1))
             .col("value", array::step::())
-            .col("key", array::rand_pseduo_uuid_hex());
+            .col("key", array::rand_pseudo_uuid_hex());
         let data = data.into_reader_rows(RowCount::from(1024), BatchCount::from(32));
         let schema = data.schema();
@@ -1616,7 +1616,7 @@
         let data = lance_datagen::gen()
             .with_seed(Seed::from(2))
             .col("value", array::step::())
-            .col("key", array::rand_pseduo_uuid_hex());
+            .col("key", array::rand_pseudo_uuid_hex());
         let data = data.into_reader_rows(RowCount::from(1024), BatchCount::from(8));
         let ds = Dataset::write(
             data,
@@ -1732,7 +1732,7 @@
             .with_seed(Seed::from(1))
             .col("other", array::rand_utf8(4.into(), false))
             .col("value", array::step::())
-            .col("key", array::rand_pseduo_uuid_hex());
+            .col("key", array::rand_pseudo_uuid_hex());
         let batch = data.into_batch_rows(RowCount::from(1024)).unwrap();
         let batch1 = batch.slice(0, 512);
         let batch2 = batch.slice(512, 512);
diff --git a/rust/lance/src/dataset/write/update.rs b/rust/lance/src/dataset/write/update.rs
index 6d413081bb..6996b382cb 100644
--- a/rust/lance/src/dataset/write/update.rs
+++ b/rust/lance/src/dataset/write/update.rs
@@ -32,7 +32,7 @@ use crate::{Error, Result};
 /// Build an update operation.
 ///
 /// This operation is similar to SQL's UPDATE statement. It allows you to change
-/// the values of all or a subset of columns with SQL expresions.
+/// the values of all or a subset of columns with SQL expressions.
 ///
 /// Use the [UpdateBuilder] to construct an update job. For example:
 ///
diff --git a/rust/lance/src/index.rs b/rust/lance/src/index.rs
index d468084571..37798a873e 100644
--- a/rust/lance/src/index.rs
+++ b/rust/lance/src/index.rs
@@ -281,7 +281,7 @@ impl DatasetIndexExt for Dataset {
             .expect("already checked")
             .clone()
             .to_vector()
-            // this should never happen beause we control the registration
+            // this should never happen because we control the registration
             // if this fails, the registration logic has a bug
             .ok_or(Error::Internal {
                 message: "unable to cast index extension to vector".to_string(),
@@ -451,7 +451,7 @@ impl DatasetIndexExt for Dataset {
             new_frag_ids |= removed_idx.fragment_bitmap.as_ref().unwrap();
         }
-        let last_idx = deltas.last().expect("Delte indices should not be empty");
+        let last_idx = deltas.last().expect("Delta indices should not be empty");
         let new_idx = IndexMetadata {
             uuid: new_id,
             name: last_idx.name.clone(), // Keep the same name
diff --git a/rust/lance/src/index/append.rs b/rust/lance/src/index/append.rs
index fd51250b60..3c6a377dd5 100644
--- a/rust/lance/src/index/append.rs
+++ b/rust/lance/src/index/append.rs
@@ -35,7 +35,7 @@ pub async fn merge_indices<'a>(
 ) -> Result, RoaringBitmap)>> {
     if old_indices.is_empty() {
         return Err(Error::Index {
-            message: "Append index: no prevoius index found".to_string(),
+            message: "Append index: no previous index found".to_string(),
             location: location!(),
         });
     };
diff --git a/rust/lance/src/index/vector/builder.rs b/rust/lance/src/index/vector/builder.rs
index 1dcee9c493..8aeb40d23f 100644
--- a/rust/lance/src/index/vector/builder.rs
+++ b/rust/lance/src/index/vector/builder.rs
@@ -261,7 +261,7 @@ impl IvfIndexBuilde
         Ok(())
     }
-    // shuffle the unindexed data and exsiting indices
+    // shuffle the unindexed data and existing indices
     // data must be with schema | ROW_ID | vector_column |
     // the shuffled data will be with schema | ROW_ID | PART_ID | code_column |
     pub async fn shuffle_data(
diff --git a/rust/lance/src/index/vector/ivf.rs b/rust/lance/src/index/vector/ivf.rs
index 256eb4f7f8..87b1cab44c 100644
--- a/rust/lance/src/index/vector/ivf.rs
+++ b/rust/lance/src/index/vector/ivf.rs
@@ -1073,7 +1073,7 @@ fn sanity_check<'a>(dataset: &'a Dataset, column: &str) -> Result<&'a Field> {
 }
 fn sanity_check_ivf_params(ivf: &IvfBuildParams) -> Result<()> {
-    if ivf.precomputed_partitons_file.is_some() && ivf.centroids.is_none() {
+    if ivf.precomputed_partitions_file.is_some() && ivf.centroids.is_none() {
         return Err(Error::Index {
             message: "precomputed_partitions_file requires centroids to be set".to_string(),
             location: location!(),
@@ -1087,10 +1087,10 @@ fn sanity_check_ivf_params(ivf: &IvfBuildParams) -> Result<()> {
         });
     }
-    if ivf.precomputed_shuffle_buffers.is_some() && ivf.precomputed_partitons_file.is_some() {
+    if ivf.precomputed_shuffle_buffers.is_some() && ivf.precomputed_partitions_file.is_some() {
         return Err(Error::Index {
             message:
-                "precomputed_shuffle_buffers and precomputed_partitons_file are mutually exclusive"
+                "precomputed_shuffle_buffers and precomputed_partitions_file are mutually exclusive"
                     .to_string(),
             location: location!(),
         });
@@ -1233,7 +1233,7 @@ async fn scan_index_field_stream(
 async fn load_precomputed_partitions_if_available(
     ivf_params: &IvfBuildParams,
 ) -> Result>> {
-    match &ivf_params.precomputed_partitons_file {
+    match &ivf_params.precomputed_partitions_file {
         Some(file) => {
             info!("Loading precomputed partitions from file: {}", file);
             let mut builder = DatasetBuilder::from_uri(file);
diff --git a/rust/lance/src/index/vector/ivf/builder.rs b/rust/lance/src/index/vector/ivf/builder.rs
index 72a8fa62c5..f29d1bda8a 100644
--- a/rust/lance/src/index/vector/ivf/builder.rs
+++ b/rust/lance/src/index/vector/ivf/builder.rs
@@ -54,7 +54,7 @@ pub(super) async fn build_partitions(
     pq: ProductQuantizer,
     metric_type: MetricType,
     part_range: Range,
-    precomputed_partitons: Option>,
+    precomputed_partitions: Option>,
     shuffle_partition_batches: usize,
     shuffle_partition_concurrency: usize,
     precomputed_shuffle_buffers: Option<(Path, Vec)>,
@@ -86,7 +86,7 @@ pub(super) async fn build_partitions(
         data,
         column,
         ivf_transformer.into(),
-        precomputed_partitons,
+        precomputed_partitions,
         ivf.num_partitions() as u32,
         shuffle_partition_batches,
         shuffle_partition_concurrency,
diff --git a/rust/lance/src/index/vector/pq.rs b/rust/lance/src/index/vector/pq.rs
index 76e64214b5..0cdad98ed9 100644
--- a/rust/lance/src/index/vector/pq.rs
+++ b/rust/lance/src/index/vector/pq.rs
@@ -163,7 +163,7 @@ impl Index for PQIndex {
             Ok(RoaringBitmap::from_sorted_iter(frag_ids).unwrap())
         } else {
             Err(Error::Index {
-                message: "PQIndex::caclulate_included_frags: PQ is not initialized".to_string(),
+                message: "PQIndex::calculate_included_frags: PQ is not initialized".to_string(),
                 location: location!(),
             })
         }
diff --git a/rust/lance/src/io/exec/optimizer.rs b/rust/lance/src/io/exec/optimizer.rs
index 028b246653..b05e5f5feb 100644
--- a/rust/lance/src/io/exec/optimizer.rs
+++ b/rust/lance/src/io/exec/optimizer.rs
@@ -70,7 +70,7 @@ impl PhysicalOptimizerRule for SimplifyProjection {
         // TODO: we could try to coalesce consecutive projections, something for later
         // For now, we just keep things simple and only remove NoOp projections
-        // output has differnet schema, projection needed
+        // output has different schema, projection needed
         if input.schema() != proj.schema() {
             return Ok(Transformed::no(plan));
         }
diff --git a/rust/lance/src/utils/future.rs b/rust/lance/src/utils/future.rs
index e771806618..00a558b14c 100644
--- a/rust/lance/src/utils/future.rs
+++ b/rust/lance/src/utils/future.rs
@@ -38,9 +38,9 @@ impl SharedPrerequisite {
         self.0
             .try_get()
             // There was no call to wait_ready and the value was accessed to early
-            .expect("SharedPrequisite cached value accessed without call to wait_ready")
+            .expect("SharedPrerequisite cached value accessed without call to wait_ready")
             // There was no call to wait_ready and the value was actually ready, but failed
-            .expect("SharedPrequisite cached value accessed without call to wait_ready")
+            .expect("SharedPrerequisite cached value accessed without call to wait_ready")
     }

     /// Asynchronously wait for the output to be ready
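
A few notes on the code the hunks above touch, in the order the files appear.

The update.rs doc comment describes a builder for SQL-style UPDATE jobs, but the extraction truncated the example that follows "For example:". Below is a toy, self-contained stand-in for that builder shape — collect a row predicate plus column assignments, then execute them as one job. The `UpdateBuilder`, `update_where`, `set`, and `execute` names here mirror the doc comment's vocabulary but this is not Lance's API; Lance operates on datasets with SQL expressions, not on in-memory string maps.

```rust
// Toy sketch of the update-job builder pattern described in the update.rs
// doc comment. Hypothetical stand-in types; not Lance's actual API.
use std::collections::HashMap;

type Row = HashMap<String, String>;

struct UpdateJob {
    predicate: Box<dyn Fn(&Row) -> bool>,
    assignments: Vec<(String, String)>,
}

struct UpdateBuilder {
    predicate: Option<Box<dyn Fn(&Row) -> bool>>,
    assignments: Vec<(String, String)>,
}

impl UpdateBuilder {
    fn new() -> Self {
        Self { predicate: None, assignments: Vec::new() }
    }

    /// Restrict the update to matching rows (stands in for a SQL WHERE clause).
    fn update_where(mut self, pred: impl Fn(&Row) -> bool + 'static) -> Self {
        self.predicate = Some(Box::new(pred));
        self
    }

    /// Assign a new value to a column (stands in for a SQL expression).
    fn set(mut self, column: &str, value: &str) -> Self {
        self.assignments.push((column.to_string(), value.to_string()));
        self
    }

    fn build(self) -> UpdateJob {
        UpdateJob {
            // With no predicate, update every row, like UPDATE without WHERE.
            predicate: self.predicate.unwrap_or_else(|| Box::new(|_| true)),
            assignments: self.assignments,
        }
    }
}

impl UpdateJob {
    /// Apply every assignment to each row the predicate accepts.
    fn execute(&self, rows: &mut [Row]) {
        for row in rows.iter_mut() {
            if (self.predicate)(row) {
                for (col, val) in &self.assignments {
                    row.insert(col.clone(), val.clone());
                }
            }
        }
    }
}
```

The value of the pattern is that the job is fully described before it runs, so validation and planning can happen once in `build` rather than per row.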
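The `sanity_check_ivf_params` hunk encodes two constraints between IVF build options: `precomputed_partitions_file` requires `centroids`, and `precomputed_shuffle_buffers` is mutually exclusive with `precomputed_partitions_file`. A minimal sketch of that validation, using a hypothetical `Params` struct (field types are simplified; `IvfBuildParams` is Lance's real type):

```rust
// Sketch of the two constraints checked by sanity_check_ivf_params above.
// `Params` is a hypothetical stand-in for IvfBuildParams.
#[derive(Default)]
struct Params {
    centroids: Option<Vec<f32>>,
    precomputed_partitions_file: Option<String>,
    precomputed_shuffle_buffers: Option<(String, Vec<String>)>,
}

fn sanity_check(p: &Params) -> Result<(), String> {
    // Precomputed partition assignments are only usable alongside centroids.
    if p.precomputed_partitions_file.is_some() && p.centroids.is_none() {
        return Err("precomputed_partitions_file requires centroids to be set".into());
    }
    // Both inputs precompute the same shuffle work, so accept at most one.
    if p.precomputed_shuffle_buffers.is_some() && p.precomputed_partitions_file.is_some() {
        return Err(
            "precomputed_shuffle_buffers and precomputed_partitions_file are mutually exclusive"
                .into(),
        );
    }
    Ok(())
}

fn main() {
    let mut p = Params::default();
    // "partitions.lance" is a made-up path for illustration.
    p.precomputed_partitions_file = Some("partitions.lance".to_string());
    assert!(sanity_check(&p).is_err()); // partitions file without centroids
    p.centroids = Some(vec![0.0; 4]);
    assert!(sanity_check(&p).is_ok());
}
```

Checking these invariants up front keeps the failure close to the bad configuration instead of deep inside the shuffle, which is why the patch cares that the parameter names in the error strings are spelled consistently.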
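Finally, the future.rs hunk corrects the expect messages that document `SharedPrerequisite`'s contract: the cached value may only be read synchronously after `wait_ready` has been awaited. A rough sketch of that await-once-then-read pattern, built on `tokio::sync::OnceCell` rather than Lance's internal type (the real type also caches a *failed* prerequisite, which the second expect message covers and this sketch omits):

```rust
// Illustrative sketch of the SharedPrerequisite contract, assuming the
// tokio crate. Not Lance's actual implementation.
use std::future::Future;
use std::sync::Arc;
use tokio::sync::OnceCell;

struct Prerequisite<T: Clone>(Arc<OnceCell<T>>);

impl<T: Clone> Prerequisite<T> {
    fn new() -> Self {
        Self(Arc::new(OnceCell::new()))
    }

    /// Compute (or wait for) the shared value exactly once.
    async fn wait_ready<F, Fut>(&self, init: F)
    where
        F: FnOnce() -> Fut,
        Fut: Future<Output = T>,
    {
        self.0.get_or_init(init).await;
    }

    /// Synchronous access to the cached value; panics if `wait_ready` was
    /// never awaited, mirroring the expect messages in the hunk above.
    fn get_ready(&self) -> T {
        self.0
            .get()
            .expect("cached value accessed without call to wait_ready")
            .clone()
    }
}

#[tokio::main]
async fn main() {
    let prereq = Prerequisite::new();
    prereq.wait_ready(|| async { 42u32 }).await;
    assert_eq!(prereq.get_ready(), 42); // safe: wait_ready completed first
}
```

Splitting the API this way lets hot paths read the value without an `.await`, at the cost of a panic when the ordering contract is violated — which is exactly what those expect messages exist to report.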