chore: adds crate-ci/typos to check repository's spelling (#3022)
This PR introduces a spelling-check workflow based on
[typos](https://github.com/crate-ci/typos) to ensure correct
spelling across the repository.

To exempt specific words or files from the typo check, add them to `lance_repo/.typos.toml`, like this:
```toml
[default.extend-words]
DNE = "DNE"
arange = "arange"
nd = "nd"
terrestial = "terrestial"
abd = "abd"
afe = "afe"

[files]
extend-exclude = ["notebooks/*.ipynb"]
```
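
The same check can also be run locally before pushing. A minimal sketch, assuming the `typos` CLI from crate-ci is installed (e.g. via `cargo install typos-cli`) and is invoked from the repository root so it picks up `.typos.toml` automatically:
```sh
# Install the checker (assumes a Rust toolchain is available)
cargo install typos-cli

# Report typos across the repository; .typos.toml is picked up automatically
typos

# Optionally apply the suggested fixes in place
typos --write-changes
```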
broccoliSpicy authored Oct 22, 2024
1 parent 7413344 commit 27b919f
Showing 81 changed files with 179 additions and 156 deletions.
13 changes: 13 additions & 0 deletions .github/workflows/typos.yml
@@ -0,0 +1,13 @@
name: Typo checker
on: [pull_request]

jobs:
  run:
    name: Spell Check with Typos
    runs-on: "ubuntu-24.04"
    steps:
      - name: Checkout Actions Repository
        uses: actions/checkout@v4

      - name: Check spelling of the entire repository
        uses: crate-ci/[email protected]
10 changes: 10 additions & 0 deletions .typos.toml
@@ -0,0 +1,10 @@
[default.extend-words]
DNE = "DNE"
arange = "arange"
nd = "nd"
terrestial = "terrestial"
abd = "abd"
afe = "afe"

[files]
extend-exclude = ["notebooks/*.ipynb"]
6 changes: 3 additions & 3 deletions benchmarks/flat/benchmark.py
@@ -30,17 +30,17 @@ def benchmark(
    dim: int,
    metric: str,
):
-    querys = [np.random.random((dim,)).reshape(-1) for _ in range(32)]
+    queries = [np.random.random((dim,)).reshape(-1) for _ in range(32)]
    # warmup
-    for query in querys:
+    for query in queries:
        ds.to_table(
            nearest={"column": "vector", "k": 10, "q": query, "use_index": False}
        )

    latency = []

    for _ in range(10):
-        for query in querys:
+        for query in queries:
            start = time.perf_counter()
            ds.to_table(
                nearest={
2 changes: 1 addition & 1 deletion benchmarks/full_report/report.ipynb
@@ -2435,7 +2435,7 @@
}
],
"source": [
-"# test NYT -- TF-IDF sparse vectors projected on to 256D dense -- normlized L2\n",
+"# test NYT -- TF-IDF sparse vectors projected on to 256D dense -- normalized L2\n",
"data = _get_nyt_vectors()\n",
"data = data[np.linalg.norm(data, axis=1) != 0]\n",
"data = np.unique(data, axis=0)\n",
2 changes: 1 addition & 1 deletion benchmarks/sift/README.md
@@ -5,7 +5,7 @@ Dataset URI: http://corpus-texmex.irisa.fr/
The SIFT/GIST-1M benchmarks make use of the [LanceDB](https://github.com/lancedb/lancedb) API to index, manage and query the datasets. Ensure the dependencies are installed. LanceDB is built on top of Lance and stores everything as Lance datasets.

```sh
-# Pin the lancedb version to the latest one availale on your own benchmark
+# Pin the lancedb version to the latest one available on your own benchmark
pip lancedb==0.3.6
pip install pandas~=2.1.0
pip duckdb~=0.9.0
2 changes: 1 addition & 1 deletion benchmarks/sift/gt.py
@@ -34,7 +34,7 @@ def generate_gt(args):
    col = args.col or infer_vector_column(ds)
    if col is None:
        raise ValueError(
-            "Can not infer vector column, please specifiy the column explicitly"
+            "Can not infer vector column, please specify the column explicitly"
        )

    samples = ds.sample(args.samples, columns=[col])[col]
6 changes: 3 additions & 3 deletions benchmarks/sift/perf.py
@@ -77,9 +77,9 @@ def summary(self):
        series = []
        for k, v in self._configs.items():
            timer = self._timers[k]
-            config_ser = pd.Series(v)
-            time_ser = timer.summary()
-            series.append(pd.concat([config_ser, time_ser]))
+            config_series = pd.Series(v)
+            time_series = timer.summary()
+            series.append(pd.concat([config_series, time_series]))
        return pd.DataFrame(series)

6 changes: 3 additions & 3 deletions docs/examples/llm_dataset_creation.rst
@@ -3,7 +3,7 @@ Creating text dataset for LLM training using Lance

Lance can be used for creating and caching a text (or code) dataset for pre-training / fine-tuning of Large Language Models.
The need for this arises when one needs to train a model on a subset of data or process the data in chunks without downloading
-all of it on the disk at once. This becomes a considerable problem when you just want a subset of a Terrabyte or Petabyte-scale dataset.
+all of it on the disk at once. This becomes a considerable problem when you just want a subset of a Terabyte or Petabyte-scale dataset.

In this example, we will be bypassing this problem by downloading a text dataset in parts, tokenizing it and saving it as a Lance dataset.
This can be done for as many or as few data samples as you wish with average memory consumption approximately 3-4 GBs!
@@ -41,7 +41,7 @@ Now we will define a function to help us with tokenizing our samples, one-by-one
    def tokenize(sample, field='text'):
        return tokenizer(sample[field])['input_ids']
-This function will recieve a sample from a huggingface dataset and tokenize the values in the `field` column. This is the main text you want
+This function will receive a sample from a huggingface dataset and tokenize the values in the `field` column. This is the main text you want
to tokenize.

Creating a Lance dataset
@@ -70,7 +70,7 @@ let's define the main function that takes in the dataset, number of samples and
    )
This function will be iterating over the huggingface dataset, one sample at a time, tokenizing the sample and yielding a pyarrow `RecordBatch`
-with all the tokens. We will do this untill we have reached the `num_samples` number of samples or the end of the dataset, whichever comes first.
+with all the tokens. We will do this until we have reached the `num_samples` number of samples or the end of the dataset, whichever comes first.

Please note that by 'sample', we mean one example (row) in the original dataset. What one example exactly means will depend on the dataset itself as it could
be one line or an entire file of text. In this example, it's varies in length between a line and a paragraph of text.
2 changes: 1 addition & 1 deletion docs/examples/llm_training.rst
@@ -9,7 +9,7 @@ In this example, we will be training an LLM using 🤗 transformers on the token

Imports and Setup
~~~~~~~~~~~~~~~~~
-Let's setup our enviornment by doing all the necessary imports and defining a few basic things.
+Let's setup our environment by doing all the necessary imports and defining a few basic things.

.. code-block:: python
2 changes: 1 addition & 1 deletion docs/format.rst
@@ -108,7 +108,7 @@ The following values are supported:
- 0.16.0
- Any
- Rework of the Lance file format that removed row groups and introduced null
-  support for lists, fixed size lists, and primtives
+  support for lists, fixed size lists, and primitives
* - 2.1 (unstable)
- None
- Any
4 changes: 2 additions & 2 deletions docs/performance.rst
@@ -7,7 +7,7 @@ Threading Model
---------------

Lance is designed to be thread-safe and performant. Lance APIs can be called concurrently unless
-explicity stated otherwise. Users may create multiple tables and share tables between threads.
+explicitly stated otherwise. Users may create multiple tables and share tables between threads.
Operations may run in parallel on the same table, but some operations may lead to conflicts. For
details see :ref:`conflict_resolution`.

@@ -80,4 +80,4 @@ with 1024 rows per batch is more appropriate.

In summary, scans could use up to ``(2 * io_buffer_size) + (batch_size * num_compute_threads)`` bytes of memory.
Keep in mind that ``io_buffer_size`` is a soft limit (e.g. we cannot read less than one page at a time right now)
-and so it is not neccesarily a bug if you see memory usage exceed this limit by a small margin.
+and so it is not necessarily a bug if you see memory usage exceed this limit by a small margin.
4 changes: 2 additions & 2 deletions java/core/lance-jni/src/blocking_dataset.rs
@@ -302,8 +302,8 @@ fn attach_native_dataset<'local>(
}

fn create_java_dataset_object<'a>(env: &mut JNIEnv<'a>) -> Result<JObject<'a>> {
-    let objet = env.new_object("com/lancedb/lance/Dataset", "()V", &[])?;
-    Ok(objet)
+    let object = env.new_object("com/lancedb/lance/Dataset", "()V", &[])?;
+    Ok(object)
}

#[no_mangle]
2 changes: 1 addition & 1 deletion java/core/lance-jni/src/blocking_scanner.rs
@@ -121,7 +121,7 @@ fn inner_create_scanner<'local>(

    let mut scanner = dataset_guard.inner.scan();

-    // handle frament_ids
+    // handle fragment_ids
    if let Some(fragment_ids) = fragment_ids_opt {
        let mut fragments = Vec::with_capacity(fragment_ids.len());
        for fragment_id in fragment_ids {
4 changes: 2 additions & 2 deletions protos/file2.proto
@@ -49,7 +49,7 @@ import "google/protobuf/empty.proto";
//
// If direct I/O is required then most (but not all) fields described
// below must be sector aligned. We have marked these fields with an
-// asterick for clarity. Readers should assume there will be optional
+// asterisk for clarity. Readers should assume there will be optional
// padding inserted before these fields.
//
// All footer fields are unsigned integers written with little endian
@@ -96,7 +96,7 @@ import "google/protobuf/empty.proto";
//
// ## Data Pages
//
-// A lot of flexiblity is provided in how data is stored. Note that the file
+// A lot of flexibility is provided in how data is stored. Note that the file
// layout has no explicit notion of a page (however, it is a part of the column
// metadata). A page's buffers do not strictly need to be contiguous on the
// disk. However, it is recommended that buffers within a page be grouped
2 changes: 1 addition & 1 deletion python/python/benchmarks/test_index.py
@@ -163,7 +163,7 @@ def test_train_ivf(test_large_dataset, benchmark, num_partitions):
    )


-# Pre-computing partition assigment only makes sense on CUDA and so this benchmark runs
+# Pre-computing partition assignment only makes sense on CUDA and so this benchmark runs
# only on CUDA.
@pytest.mark.benchmark(group="assign_partitions")
@pytest.mark.parametrize("num_partitions", [100, 300])
4 changes: 2 additions & 2 deletions python/python/lance/_arrow/bf16.py
@@ -105,12 +105,12 @@ def __init__(self):
        pa.ExtensionType.__init__(self, pa.binary(2), "lance.bfloat16")

    def __arrow_ext_serialize__(self):
-        # TODO: encode endianess
+        # TODO: encode endianness
        return b""

    @classmethod
    def __arrow_ext_deserialize__(self, storage_type, serialized):
-        # TODO: decode endianess
+        # TODO: decode endianness
        return BFloat16Type()

    def __arrow_ext_class__(self):
2 changes: 1 addition & 1 deletion python/python/lance/_dataset/sharded_batch_iterator.py
@@ -21,7 +21,7 @@
class ShardedBatchIterator:
    """An iterator of RecordBatches, over the sharded dataset.
-    Parmeters
+    Parameters
    ---------
    uri: str or Path
        Dataset base URI
2 changes: 1 addition & 1 deletion python/python/lance/dataset.py
@@ -938,7 +938,7 @@ def add_columns(
            The names of the columns that the UDF will read. If None, then the
            UDF will read all columns. This is only used when transforms is a
            UDF. Otherwise, the read columns are inferred from the SQL expressions.
-        reader_scheam: pa.Schema, optional
+        reader_schema: pa.Schema, optional
            Only valid if transforms is a `ReaderLike` object. This will be used to
            determine the schema of the reader.
        batch_size: int, optional
4 changes: 2 additions & 2 deletions python/python/lance/ray/sink.py
@@ -351,7 +351,7 @@ def __call__(self, batch: Union[pa.Table, "pd.DataFrame"]) -> Dict[str, Any]:


class LanceCommitter(_BaseLanceDatasink):
-    """Lance Commiter as Ray Datasink.
+    """Lance Committer as Ray Datasink.
    This is used with `LanceFragmentWriter` to write large-than-memory data to
    lance file.
@@ -362,7 +362,7 @@ def num_rows_per_write(self) -> int:
        return 1

    def get_name(self) -> str:
-        return f"LanceCommiter({self.mode})"
+        return f"LanceCommitter({self.mode})"

    def write(
        self,
2 changes: 1 addition & 1 deletion python/python/tests/test_optimize.py
@@ -130,7 +130,7 @@ def test_compact_with_write(tmp_path: Path):
    # This test creates a dataset with a manifest containing fragments
    # that are not in sorted order (by id)
    #
-    # We do this by runnign compaction concurrently with append
+    # We do this by running compaction concurrently with append
    #
    # This is because compaction first reserves a fragment id. Then the
    # concurrent writes grab later ids and commit them. Then the compaction
2 changes: 1 addition & 1 deletion python/python/tests/test_ray.py
@@ -17,7 +17,7 @@
    _register_hooks,
)

-# Use this hook until we have offical DataSink in Ray.
+# Use this hook until we have official DataSink in Ray.
_register_hooks()

ray.init()
2 changes: 1 addition & 1 deletion python/src/dataset.rs
@@ -1702,7 +1702,7 @@ fn prepare_vector_index_params(
    };

    if let Some(f) = kwargs.get_item("precomputed_partitions_file")? {
-        ivf_params.precomputed_partitons_file = Some(f.to_string());
+        ivf_params.precomputed_partitions_file = Some(f.to_string());
    };

    if let Some(storage_options) = storage_options {
4 changes: 2 additions & 2 deletions rust/lance-core/src/datatypes.rs
@@ -262,7 +262,7 @@ impl TryFrom<&LogicalType> for DataType {
            "dict" => {
                if splits.len() != 4 {
                    Err(Error::Schema {
-                        message: format!("Unsupport dictionary type: {}", lt),
+                        message: format!("Unsupported dictionary type: {}", lt),
                        location: location!(),
                    })
                } else {
@@ -274,7 +274,7 @@
            "decimal" => {
                if splits.len() != 4 {
                    Err(Error::Schema {
-                        message: format!("Unsupport decimal type: {}", lt),
+                        message: format!("Unsupported decimal type: {}", lt),
                        location: location!(),
                    })
                } else {
2 changes: 1 addition & 1 deletion rust/lance-core/src/utils/testing.rs
@@ -57,7 +57,7 @@ pub struct ProxyObjectStorePolicy {
    /// be returned instead.
    before_policies: HashMap<String, PolicyFn>,
    /// Policies which run after calls that return ObjectMeta. The policy can
-    /// tranform the returned ObjectMeta to mock out file listing results.
+    /// transform the returned ObjectMeta to mock out file listing results.
    object_meta_policies: HashMap<String, ObjectMetaPolicyFn>,
}

6 changes: 3 additions & 3 deletions rust/lance-datafusion/src/exec.rs
@@ -35,14 +35,14 @@ use log::{debug, info, warn};
/// An source execution node created from an existing stream
///
/// It can only be used once, and will return the stream. After that the node
-/// is exhuasted.
+/// is exhausted.
///
/// Note: the stream should be finite, otherwise we will report datafusion properties
/// incorrectly.
pub struct OneShotExec {
    stream: Mutex<Option<SendableRecordBatchStream>>,
    // We save off a copy of the schema to speed up formatting and so ExecutionPlan::schema & display_as
-    // can still function after exhuasted
+    // can still function after exhausted
    schema: Arc<ArrowSchema>,
    properties: PlanProperties,
}
@@ -91,7 +91,7 @@ impl DisplayAs for OneShotExec {
        let stream = self.stream.lock().unwrap();
        match t {
            DisplayFormatType::Default | DisplayFormatType::Verbose => {
-                let exhausted = if stream.is_some() { "" } else { "EXHUASTED " };
+                let exhausted = if stream.is_some() { "" } else { "EXHAUSTED" };
                let columns = self
                    .schema
                    .field_names()
2 changes: 1 addition & 1 deletion rust/lance-datafusion/src/logical_expr.rs
@@ -287,7 +287,7 @@ pub mod tests {

    #[test]
    fn test_resolve_in_expr() {
-        // Type coersion should apply for `A IN (0)` or `A NOT IN (0)`
+        // Type coercion should apply for `A IN (0)` or `A NOT IN (0)`
        let arrow_schema = ArrowSchema::new(vec![Field::new("a", DataType::Float32, false)]);
        let expr = Expr::in_list(
            Expr::Column("a".to_string().into()),
2 changes: 1 addition & 1 deletion rust/lance-datafusion/src/substrait.rs
@@ -350,7 +350,7 @@ pub async fn parse_substrait(expr: &[u8], input_schema: Arc<Schema>) -> Result<E

    // When DF parses the above plan it turns column references into qualified references
    // into `dummy` (e.g. we get `WHERE dummy.x < 0` instead of `WHERE x < 0`) We want
-    // these to be unqualified references instead and so we need a quick trasnformation pass
+    // these to be unqualified references instead and so we need a quick transformation pass

    let expr = expr.transform(&|node| match node {
        Expr::Column(column) => {
16 changes: 8 additions & 8 deletions rust/lance-datagen/src/generator.rs
@@ -470,9 +470,9 @@ impl ArrayGenerator for CycleVectorGenerator {
}

#[derive(Default)]
-pub struct PseduoUuidGenerator {}
+pub struct PseudoUuidGenerator {}

-impl ArrayGenerator for PseduoUuidGenerator {
+impl ArrayGenerator for PseudoUuidGenerator {
    fn generate(
        &mut self,
        length: RowCount,
@@ -497,9 +497,9 @@ impl ArrayGenerator for PseduoUuidGenerator {
}

#[derive(Default)]
-pub struct PseduoUuidHexGenerator {}
+pub struct PseudoUuidHexGenerator {}

-impl ArrayGenerator for PseduoUuidHexGenerator {
+impl ArrayGenerator for PseudoUuidHexGenerator {
    fn generate(
        &mut self,
        length: RowCount,
@@ -1581,8 +1581,8 @@ pub mod array {
    /// Note, these are "pseudo UUIDs". They are 16-byte randomish values but they
    /// are not guaranteed to be unique. We use a simplistic RNG that trades uniqueness
    /// for speed.
-    pub fn rand_pseduo_uuid() -> Box<dyn ArrayGenerator> {
-        Box::<PseduoUuidGenerator>::default()
+    pub fn rand_pseudo_uuid() -> Box<dyn ArrayGenerator> {
+        Box::<PseudoUuidGenerator>::default()
    }

    /// Create a generator of random UUIDs, stored as 32-character strings (hex encoding
@@ -1591,8 +1591,8 @@
    /// Note, these are "pseudo UUIDs". They are 16-byte randomish values but they
    /// are not guaranteed to be unique. We use a simplistic RNG that trades uniqueness
    /// for speed.
-    pub fn rand_pseduo_uuid_hex() -> Box<dyn ArrayGenerator> {
-        Box::<PseduoUuidHexGenerator>::default()
+    pub fn rand_pseudo_uuid_hex() -> Box<dyn ArrayGenerator> {
+        Box::<PseudoUuidHexGenerator>::default()
    }

    pub fn rand_primitive<T: ArrowPrimitiveType + Send + Sync>(