Skip to content

Commit

Permalink
feat: add string doc. and update to version 0.2.4
Browse files Browse the repository at this point in the history
  • Loading branch information
porameht committed Aug 14, 2024
1 parent 1133157 commit c33799b
Show file tree
Hide file tree
Showing 6 changed files with 68 additions and 16 deletions.
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "thongna"
version = "0.2.3"
version = "0.2.4"
edition = "2021"
license = "Apache-2.0"
authors = ["Porameht Khumsombat"]
Expand Down
4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "maturin"

[project]
name = "thongna"
version = "0.2.3"
version = "0.2.4"
requires-python = ">=3.8"
description = "Blazing-fast Thai text processing library powered by Rust"
authors = [
Expand Down Expand Up @@ -32,7 +32,7 @@ Repository = "https://github.com/porameht/thongna"

[tool.maturin]
features = ["pyo3/extension-module"]
module-name = "_thongna"
module-name = "thongna"
python-source = "thongna"

[tool.pytest.ini_options]
Expand Down
43 changes: 42 additions & 1 deletion src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,20 @@ lazy_static! {
}

#[pyfunction]
#[pyo3(text_signature = "(text, whitespace_number=True)")]
pub fn normalize(text: &str, whitespace_number: bool) -> PyResult<String> {
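// NOTE(review): these are plain `//` comments inside the function body, so
// PyO3 does not expose them as the Python docstring. To do that they would
// need to be `///` doc comments placed above `#[pyfunction]`.
//
// Normalize Thai text.
//
// This function normalizes Thai text by applying various rules to standardize
// the text representation.
//
// Args:
// text (str): Input text to be normalized
// whitespace_number (bool): If True, adds spaces around numbers.
// The advertised default of True comes from `text_signature` only; no
// `#[pyo3(signature = ...)]` default is declared on this binding, so the
// argument must be passed explicitly here.
//
// Returns:
// str: Normalized text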
let mut text = text.to_string();

if whitespace_number {
Expand Down Expand Up @@ -66,7 +79,22 @@ pub fn normalize(text: &str, whitespace_number: bool) -> PyResult<String> {
}

#[pyfunction]
#[pyo3(text_signature = "(text, dict_name, safe=False, parallel=False)")]
fn newmm(text: &str, dict_name: &str, safe: bool, parallel: bool) -> PyResult<Vec<String>> {
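// NOTE(review): these are plain `//` comments inside the function body, so
// PyO3 does not expose them as the Python docstring. To do that they would
// need to be `///` doc comments placed above `#[pyfunction]`.
//
// Break text into tokens.
//
// This function is an implementation of newmm segmentation.
// Multithreaded mode is supported and is enabled with the `parallel` flag.
//
// Args:
// text (str): Input text
// dict_name (str): Dictionary name, as assigned in load_dict()
// safe (bool): Use safe mode to avoid long waiting times on text with
// many ambiguous word boundaries.
// parallel (bool): Use multithreaded mode.
// The defaults of False shown in `text_signature` are display-only; no
// `#[pyo3(signature = ...)]` defaults are declared on this binding, so
// `safe` and `parallel` must be passed explicitly here.
//
// Returns:
// List[str]: List of tokens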
if let Some(loaded_dict) = DICT_COLLECTION.read().unwrap().get(dict_name) {
let result = loaded_dict.segment_to_string(text, safe, parallel);
Ok(result)
Expand All @@ -79,7 +107,20 @@ fn newmm(text: &str, dict_name: &str, safe: bool, parallel: bool) -> PyResult<Ve
}

#[pyfunction]
#[pyo3(text_signature = "(file_path, dict_name)")]
fn load_dict(file_path: &str, dict_name: &str) -> PyResult<(String, bool)> {
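// NOTE(review): these are plain `//` comments inside the function body, so
// PyO3 does not expose them as the Python docstring. To do that they would
// need to be `///` doc comments placed above `#[pyfunction]`.
//
// Load a dictionary from a file.
//
// Loads a dictionary file into the in-memory dictionary collection
// and assigns dict_name to it.
// This function does not overwrite an existing dictionary name.
//
// Args:
// file_path (str): Path to a dictionary file
// dict_name (str): A unique dictionary name, used for reference
//
// Returns:
// Tuple[str, bool]: A tuple containing a human-readable result string and a
// boolean (presumably whether the dictionary was loaded; TODO confirm).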
let mut dict_col_lock = DICT_COLLECTION.write().unwrap();
if dict_col_lock.get(dict_name).is_some() {
Ok((
Expand All @@ -104,7 +145,7 @@ fn load_dict(file_path: &str, dict_name: &str) -> PyResult<(String, bool)> {
}

#[pymodule]
fn _thongna(m: &Bound<'_, PyModule>) -> PyResult<()> {
fn thongna(m: &Bound<'_, PyModule>) -> PyResult<()> {
m.add_function(wrap_pyfunction!(normalize, m)?)?;
m.add_function(wrap_pyfunction!(newmm, m)?)?;
m.add_function(wrap_pyfunction!(load_dict, m)?)?;
Expand Down
8 changes: 1 addition & 7 deletions tests/test_newmm.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,6 @@
import unittest
from typing import List
import os
import sys

project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
sys.path.insert(0, project_root)

from thongna import load_dict, newmm
from thongna_py import newmm, load_dict

class TestTokenizePackage(unittest.TestCase):
def setUp(self):
Expand Down
25 changes: 21 additions & 4 deletions thongna/__init__.py → thongna_py/__init__.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
from pathlib import Path
from typing import List, Tuple

from _thongna import load_dict as rust_load_dict # type: ignore
from _thongna import newmm as rust_newmm # type: ignore

from thongna import load_dict as rust_load_dict # type: ignore
from thongna import newmm as rust_newmm # type: ignore
from thongna import normalize as rust_normalize # type: ignore

def load_dict(file_path: str, dict_name: str) -> Tuple[str, bool]:
"""
Expand Down Expand Up @@ -49,4 +49,21 @@ def newmm(
if not isinstance(text, str) or not text:
return []

return rust_newmm(text, dict_name, safe, parallel)
return rust_newmm(text, dict_name, safe, parallel)

def normalize(text: str, whitespace_number: bool = True) -> str:
    """
    Normalize Thai text.

    Thin wrapper that forwards both arguments to the Rust ``normalize``
    binding, which applies various rules to standardize the text
    representation.

    Args:
        text (str): Input text to be normalized.
        whitespace_number (bool, optional): If True, adds spaces around
            numbers. Defaults to True.

    Returns:
        str: Normalized text.
    """
    normalized: str = rust_normalize(text, whitespace_number)
    return normalized

0 comments on commit c33799b

Please sign in to comment.