feat: add docstrings and update to version 0.2.4

porameht · Aug 14, 2024 · c33799b · c33799b
1 parent 1133157
commit c33799b
Show file tree

Hide file tree

Showing 6 changed files with 68 additions and 16 deletions.
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "thongna"
-version = "0.2.3"
+version = "0.2.4"
 edition = "2021"
 license = "Apache-2.0"
 authors = ["Porameht Khumsombat"]

diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "maturin"
 
 [project]
 name = "thongna"
-version = "0.2.3"
+version = "0.2.4"
 requires-python = ">=3.8"
 description = "Blazing-fast Thai text processing library powered by Rust"
 authors = [
@@ -32,7 +32,7 @@ Repository = "https://github.com/porameht/thongna"
 
 [tool.maturin]
 features = ["pyo3/extension-module"]
-module-name = "_thongna"
+module-name = "thongna"
 python-source = "thongna"
 
 [tool.pytest.ini_options]

diff --git a/src/lib.rs b/src/lib.rs
@@ -38,7 +38,20 @@ lazy_static! {
 }
 
 #[pyfunction]
+#[pyo3(text_signature = "(text, whitespace_number=True)")]
 pub fn normalize(text: &str, whitespace_number: bool) -> PyResult<String> {
+    // Normalize Thai text.
+    //
+    // This function normalizes Thai text by applying various rules to standardize
+    // the text representation.
+    //
+    // Args:
+    //     text (str): Input text to be normalized
+    //     whitespace_number (bool, optional): If True, adds spaces around numbers. 
+    //                                         Defaults to True.
+    //
+    // Returns:
+    //     str: Normalized text
     let mut text = text.to_string();
 
     if whitespace_number {
@@ -66,7 +79,22 @@ pub fn normalize(text: &str, whitespace_number: bool) -> PyResult<String> {
 }
 
 #[pyfunction]
+#[pyo3(text_signature = "(text, dict_name, safe=False, parallel=False)")]
 fn newmm(text: &str, dict_name: &str, safe: bool, parallel: bool) -> PyResult<Vec<String>> {
+    // Break text into tokens.
+    //
+    // This function is an implementation of newmm segmentation.
+    // Multithreaded mode is supported and is enabled by the `parallel` flag.
+    //
+    // Args:
+    //     text (str): Input text
+    //     dict_name (str): Dictionary name, as assigned in load_dict()
+    //     safe (bool, optional): Use safe mode to avoid long waiting time in
+    //         a text with lots of ambiguous word boundaries. Defaults to False.
+    //     parallel (bool, optional): Use multithreaded mode. Defaults to False.
+    //
+    // Returns:
+    //     List[str]: List of tokens
     if let Some(loaded_dict) = DICT_COLLECTION.read().unwrap().get(dict_name) {
         let result = loaded_dict.segment_to_string(text, safe, parallel);
         Ok(result)
@@ -79,7 +107,20 @@ fn newmm(text: &str, dict_name: &str, safe: bool, parallel: bool) -> PyResult<Ve
 }
 
 #[pyfunction]
+#[pyo3(text_signature = "(file_path, dict_name)")]
 fn load_dict(file_path: &str, dict_name: &str) -> PyResult<(String, bool)> {
+    // Load dictionary from a file.
+    //
+    // Load a dictionary file into an in-memory dictionary collection,
+    // and assign dict_name to it.
+    // This function does not overwrite a dictionary that already uses dict_name.
+    //
+    // Args:
+    //     file_path (str): Path to a dictionary file
+    //     dict_name (str): A unique dictionary name, used for reference
+    //
+    // Returns:
+    //     Tuple[str, bool]: A tuple containing a human-readable result string and a boolean
     let mut dict_col_lock = DICT_COLLECTION.write().unwrap();
     if dict_col_lock.get(dict_name).is_some() {
         Ok((
@@ -104,7 +145,7 @@ fn load_dict(file_path: &str, dict_name: &str) -> PyResult<(String, bool)> {
 }
 
 #[pymodule]
-fn _thongna(m: &Bound<'_, PyModule>) -> PyResult<()> {
+fn thongna(m: &Bound<'_, PyModule>) -> PyResult<()> {
     m.add_function(wrap_pyfunction!(normalize, m)?)?;
     m.add_function(wrap_pyfunction!(newmm, m)?)?;
     m.add_function(wrap_pyfunction!(load_dict, m)?)?;

diff --git a/tests/test_newmm.py b/tests/test_newmm.py
@@ -1,12 +1,6 @@
 import unittest
 from typing import List
-import os
-import sys
-
-project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
-sys.path.insert(0, project_root)
-
-from thongna import load_dict, newmm
+from thongna_py import newmm, load_dict
 
 class TestTokenizePackage(unittest.TestCase):
     def setUp(self):

diff --git a/thongna/__init__.py → thongna_py/__init__.py b/thongna/__init__.py → thongna_py/__init__.py
@@ -1,9 +1,9 @@
 from pathlib import Path
 from typing import List, Tuple
 
-from _thongna import load_dict as rust_load_dict  # type: ignore
-from _thongna import newmm as rust_newmm  # type: ignore
-
+from thongna import load_dict as rust_load_dict  # type: ignore
+from thongna import newmm as rust_newmm  # type: ignore
+from thongna import normalize as rust_normalize # type: ignore
 
 def load_dict(file_path: str, dict_name: str) -> Tuple[str, bool]:
     """
@@ -49,4 +49,21 @@ def newmm(
     if not isinstance(text, str) or not text:
         return []
 
-    return rust_newmm(text, dict_name, safe, parallel)
+    return rust_newmm(text, dict_name, safe, parallel)
+
+def normalize(text: str, whitespace_number: bool = True) -> str:
+    """
+    Normalize Thai text.
+
+    This function normalizes Thai text by applying various rules to standardize
+    the text representation.
+
+    Args:
+        text (str): Input text to be normalized
+        whitespace_number (bool, optional): If True, adds spaces around numbers. 
+                                            Defaults to True.
+
+    Returns:
+        str: Normalized text
+    """
+    return rust_normalize(text, whitespace_number)