Merge branch 'main' into dev/add_duckdb

haobibo · web-flow · commit c406bf0a3ffe · 2025-08-18T21:42:32.000+08:00
Signed-off-by: Bibo Hao <haobibo@users.noreply.github.com>
diff --git a/.github/workflows/pip.yml b/.github/workflows/pip.yml
@@ -1,10 +1,8 @@
 # Secret Variables required in GitHub secrets: TWINE_USERNAME, TWINE_PASSWORD / TWINE_USERNAME_TEST, TWINE_PASSWORD_TEST
 
-name: build
+name: build-pip-publish
 
-# Controls when the action will run. 
 on:
-  # Triggers the workflow on push or pull request events but only for the main branch
   push:
     branches: [ main ]
     paths-ignore: [ "*.md" ]
@@ -26,7 +24,7 @@ jobs:
     steps:
       # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
       # sudo python setup.py install clean --all
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
 
       - name: pip-install-test
         run: |
@@ -48,7 +46,7 @@ jobs:
           sudo python3 -c "import fcntl; fcntl.fcntl(1, fcntl.F_SETFL, 0)"
           sudo python3 setup.py sdist bdist_wheel
           ls -alh ./dist
-          if [ "${GITHUB_REPOSITORY}" = "QPod/aloha" ] && [ "${GITHUB_REF_NAME}" = "main" ] ; then
+          if [ "${GITHUB_REPOSITORY}" = "QPod/aloha-python" ] && [ "${GITHUB_REF_NAME}" = "main" ] ; then
             twine upload dist/* --verbose -u "${TWINE_USERNAME}" -p "${TWINE_PASSWORD}" ;
           elif [ ! -z "${TWINE_USERNAME_TEST}" ]; then
             twine upload dist/* --verbose -u "${TWINE_USERNAME_TEST}" -p "${TWINE_PASSWORD_TEST}" \
diff --git a/README.md b/README.md
@@ -1,7 +1,7 @@
 # Aloha!
 
 [![License](https://img.shields.io/github/license/QPod/aloha)](https://github.com/QPod/aloha/blob/main/LICENSE)
-[![GitHub Workflow Status](https://img.shields.io/github/workflow/status/QPod/aloha/build)](https://github.com/QPod/aloha/actions)
+[![GitHub Workflow Status](https://img.shields.io/github/actions/workflow/status/QPod/aloha-python/pip.yml?branch=main)](https://github.com/QPod/aloha-python/actions)
 [![Join the Gitter Chat](https://img.shields.io/gitter/room/nwjs/nw.js.svg)](https://gitter.im/QPod/)
 [![PyPI version](https://img.shields.io/pypi/v/aloha)](https://pypi.python.org/pypi/aloha/)
 [![PyPI Downloads](https://img.shields.io/pypi/dm/aloha)](https://pepy.tech/badge/aloha/)
@@ -21,6 +21,6 @@ Please generously STAR★ our project or donate to us!  [![GitHub Starts](https:
 
 ## Getting started
 
-```py
+```shell
 pip install aloha[all]
 ```
diff --git a/demo/app_common/ainlp/__init__.py b/demo/app_common/ainlp/__init__.py
diff --git a/demo/app_common/ainlp/model_bert.py b/demo/app_common/ainlp/model_bert.py
@@ -0,0 +1,86 @@
+from typing import List
+
+import torch
+from transformers import AutoTokenizer, AutoModel
+
+from aloha.service.streamer import ManagedModel
+
+SEED = 0
+torch.manual_seed(SEED)
+torch.cuda.manual_seed(SEED)
+
+
+class TextUnmaskModel:
+    def __init__(self, max_sent_len=16, model_path="bert-base-uncased"):
+        self.model_path = model_path
+        self.tokenizer = AutoTokenizer.from_pretrained(self.model_path)
+        self.transformer = AutoModel.from_pretrained(self.model_path)
+        self.transformer.eval()
+        self.transformer.to(device="cuda")
+        self.max_sent_len = max_sent_len
+
+    def predict(self, batch: List[str]) -> List[str]:
+        """predict masked word"""
+        batch_inputs = []
+        masked_indexes = []
+
+        for text in batch:
+            tokenized_text = self.tokenizer.tokenize(text)
+            if len(tokenized_text) > self.max_sent_len - 2:
+                tokenized_text = tokenized_text[: self.max_sent_len - 2]
+
+            tokenized_text = ['[CLS]'] + tokenized_text + ['[SEP]']
+            tokenized_text += ['[PAD]'] * (self.max_sent_len - len(tokenized_text))
+
+            indexed_tokens = self.tokenizer.convert_tokens_to_ids(tokenized_text)
+            batch_inputs.append(indexed_tokens)
+            masked_indexes.append(tokenized_text.index('[MASK]'))
+
+        tokens_tensor = torch.tensor(batch_inputs).to("cuda")
+
+        with torch.no_grad():
+            # prediction_scores: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
+            prediction_scores = self.transformer(tokens_tensor)[0]
+
+        batch_outputs = []
+        for i in range(len(batch_inputs)):
+            predicted_index = torch.argmax(prediction_scores[i, masked_indexes[i]]).item()
+            predicted_token = self.tokenizer.convert_ids_to_tokens(predicted_index)
+            batch_outputs.append(predicted_token)
+
+        return batch_outputs
+
+
+class ManagedBertModel(ManagedModel):
+    def init_model(self):
+        self.model = TextUnmaskModel()
+
+    def predict(self, batch):
+        return self.model.predict(batch)
+
+
+def test_simple():
+    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
+    model = AutoModel.from_pretrained("bert-base-uncased")
+    inputs = tokenizer("Hello! My name is [MASK]!", return_tensors="pt")
+    outputs = model(**inputs)
+    print(outputs)
+
+    predicted_index = torch.argmax(outputs[1]).item()
+    predicted_token = tokenizer.convert_ids_to_tokens(predicted_index)
+    print(predicted_token)
+
+
+def test_batch():
+    batch_text = [
+        "twinkle twinkle [MASK] star.",
+        "Happy birthday to [MASK].",
+        'the answer to life, the [MASK], and everything.'
+    ]
+    model = TextUnmaskModel()
+    outputs = model.predict(batch_text)
+    print(outputs)
+
+
+if __name__ == "__main__":
+    test_simple()
diff --git a/demo/app_common/ainlp/test-gpu-async.py b/demo/app_common/ainlp/test-gpu-async.py
diff --git a/demo/app_common/api/api_multipart.py b/demo/app_common/api/api_multipart.py
@@ -0,0 +1,14 @@
+from aloha.logger import LOG
+from aloha.service.api.v0 import APIHandler
+
+
+class MultipartHandler(APIHandler):
+    def response(self, params=None, *args, **kwargs):
+        LOG.debug(params)
+        return params
+
+
+default_handlers = [
+    # internal API: QueryDB Postgres with sql directly
+    (r"/api_internal/multipart", MultipartHandler),
+]
diff --git a/demo/app_common/debug.py b/demo/app_common/debug.py
@@ -6,6 +6,7 @@ def main():
     modules_to_load = [
         "app_common.api.api_common_sys_info",
         "app_common.api.api_common_query_postgres",
+        "app_common.api.api_multipart",
     ]
 
     if 'service' not in SETTINGS.config:
diff --git a/src/README.md b/src/README.md
@@ -1,7 +1,7 @@
 # Aloha!
 
 [![License](https://img.shields.io/github/license/QPod/aloha)](https://github.com/QPod/aloha/blob/main/LICENSE)
-[![GitHub Workflow Status](https://img.shields.io/github/workflow/status/QPod/aloha/build)](https://github.com/QPod/aloha/actions)
+[![GitHub Workflow Status](https://img.shields.io/github/actions/workflow/status/QPod/aloha-python/pip.yml?branch=main)](https://github.com/QPod/aloha-python/actions)
 [![Join the Gitter Chat](https://img.shields.io/gitter/room/nwjs/nw.js.svg)](https://gitter.im/QPod/)
 [![PyPI version](https://img.shields.io/pypi/v/aloha)](https://pypi.python.org/pypi/aloha/)
 [![PyPI Downloads](https://img.shields.io/pypi/dm/aloha)](https://pepy.tech/badge/aloha/)
@@ -21,6 +21,6 @@ Please generously STAR★ our project or donate to us!  [![GitHub Starts](https:
 
 ## Getting started
 
-```py
+```shell
 pip install aloha[all]
 ```
diff --git a/src/aloha/config/paths.py b/src/aloha/config/paths.py
@@ -1,6 +1,7 @@
 __all__ = ('get_resource_dir', 'get_config_dir', 'get_current_module_dir', 'get_project_base_dir', 'path_join')
 
 import os
+import sys
 import warnings
 
 
@@ -24,7 +25,6 @@ def get_config_dir(*args) -> str:
     if dir_config is None or len(dir_config.strip()) == 0:
         dir_config = 'config'
     dir_config = path_join(dir_resource, dir_config, *args)
-    # print(' ---> Using config dir:', dir_config)
     return dir_config
 
 
@@ -48,15 +48,19 @@ def get_config_files() -> list:
 
     files = files_config.split(',')
     ret = []
+    msgs = []
     for f in files:
         file = get_config_dir(f)
         if not os.path.exists(file):
-            warnings.warn('Expecting config file [%s] but it does not exists!' % file)
+            msgs.append('Expecting config file [%s] but it does not exists!' % file)
         else:
-            print('  ---> Loading config file [%s]' % file)
+            print('  ---> Loading config file [%s]' % file, file=sys.stderr)
             ret.append(os.path.expandvars(f))
     if len(ret) == 0:
-        warnings.warn('No config files set properly, EMPTY config will be used!')
+        msgs.append('No config files set properly, EMPTY config will be used!')
+
+    if len(msgs) > 0:
+        warnings.warn('\n'.join(msgs))
     return ret
 
 
diff --git a/src/aloha/db/mysql.py b/src/aloha/db/mysql.py
@@ -24,7 +24,7 @@ def __init__(self, db_config, **kwargs):
         try:
             self.db = create_engine(
                 'mysql+pymysql://{user}:{password}@{host}:{port}/{dbname}'.format(**self._config),
-                encoding='utf-8', pool_size=50, pool_recycle=500, pool_pre_ping=True, **kwargs
+                pool_size=50, pool_recycle=500, pool_pre_ping=True, **kwargs
             )
             LOG.debug("MySQL connected: {host}:{port}/{dbname}".format(**self._config))
         except Exception as e:
diff --git a/src/aloha/db/postgres.py b/src/aloha/db/postgres.py
@@ -1,6 +1,6 @@
 __all__ = ('PostgresOperator',)
 
-import psycopg2
+import psycopg
 from sqlalchemy import create_engine
 from sqlalchemy.sql import text
 
@@ -26,8 +26,8 @@ def __init__(self, db_config, **kwargs):
 
         try:
             self.engine = create_engine(
-                'postgresql+psycopg2://{user}:{password}@{host}:{port}/{dbname}'.format(**self._config),
-                connect_args=connect_args, client_encoding='utf8', encoding='utf-8',
+                'postgresql+psycopg://{user}:{password}@{host}:{port}/{dbname}'.format(**self._config),
+                connect_args=connect_args, client_encoding='utf8',
                 pool_size=20, max_overflow=10, pool_pre_ping=True, **kwargs
             )
             LOG.debug("PostgresSQL connected: {host}:{port}/{dbname}".format(**self._config))
diff --git a/src/aloha/encrypt/vault/cyberark.py b/src/aloha/encrypt/vault/cyberark.py
@@ -11,7 +11,8 @@
 from ...logger import LOG
 
 requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
-requests.packages.urllib3.util.ssl_.DEFAULT_CIPHERS += ':HIGHT:!DH:!aNULL'
+if hasattr(requests.packages.urllib3.util.ssl_, 'DEFAULT_CIPHERS'):
+    requests.packages.urllib3.util.ssl_.DEFAULT_CIPHERS += ':HIGHT:!DH:!aNULL'
 
 
 class CyberArkVault(BaseVault, AesEncryptor):
diff --git a/src/aloha/service/api/v0.py b/src/aloha/service/api/v0.py
@@ -13,8 +13,10 @@ class APIHandler(AbstractApiHandler, ABC):
     }
 
     async def post(self, *args, **kwargs):
-        body_arguments = self.request_body
-        kwargs.update(body_arguments)
+        req_body = self.request_body
+
+        if req_body is not None:  # body_arguments
+            kwargs.update(req_body)
 
         resp = dict(code=5200, message=['success'])
         try:
diff --git a/src/aloha/service/http/base_api_handler.py b/src/aloha/service/http/base_api_handler.py
@@ -51,7 +51,7 @@ def request_body(self) -> dict:
         body_arguments: dict = Optional[None]
 
         if content_type.startswith('multipart/form-data'):  # only parse files when 'Content-Type' starts with 'multipart/form-data'
-            body_arguments = self.request.body_arguments
+            body_arguments = self.request_param  # self.request.body_arguments
         else:
             try:
                 body = self.request.body.decode('utf-8')
@@ -62,8 +62,16 @@ def request_body(self) -> dict:
 
     @property
     def request_param(self) -> dict:
-        url_arguments: dict = {k: v[0].decode('utf-8') for k, v in self.request.arguments.items()}
-        return url_arguments
+        ret: dict = {}
+        for k, v in self.request.arguments.items():
+            val = v[0].decode('utf-8')
+            try:
+                value = json.loads(val)
+            except json.JSONDecodeError:
+                value = val
+            ret[k] = value
+
+        return ret
 
 
 class DefaultHandler404(AbstractApiHandler):
diff --git a/src/aloha/service/http/files.py b/src/aloha/service/http/files.py
@@ -0,0 +1,33 @@
+import time
+
+import requests
+
+from ...logger import LOG
+
+
+def iter_over_request_files(request, url_files):
+    for file_key, files in request.files.items():  # iter over files uploaded by multipart
+        for f in files:
+            file_name, content_type = f["filename"], f["content_type"]
+            body = f.get('body', b"")
+            LOG.info(f"File {file_name} from multipart has content type {content_type} and length bytes={len(body)}")
+            yield file_key, file_name, content_type, body
+
+    for file_key, list_url in {'url_files': url_files or []}.items():  # iter over files specified by `url_files`
+        for url in sorted(set(list_url)):
+            try:
+                t_start = time.time()
+                resp = requests.get(url, stream=True)  # download the file from given url
+                if resp.status_code == 200:
+                    body = resp.content
+                    content_type = resp.headers.get("Content-Type", "UNKNOWN")
+                else:
+                    raise RuntimeError("Failed to download file after %s seconds with code=%s from URL %s" % (
+                        time.time() - t_start, resp.status_code, url
+                    ))
+                del resp
+            except Exception as e:
+                raise e
+            t_cost = time.time() - t_start
+            LOG.info(f"File {url} has content type {content_type} and length bytes={len(body)}, downloaded in {t_cost} seconds")
+            yield 'url_files', url, content_type, body
diff --git a/src/aloha/service/streamer/redis.py b/src/aloha/service/streamer/redis.py
@@ -5,9 +5,13 @@
 import threading
 import time
 
-from redis import Redis
-
 from .base import BaseStreamer, BaseWorker, TIMEOUT, TIME_SLEEP, logger
+from ...logger import LOG
+
+try:
+    from redis import Redis
+except ImportError:
+    LOG.warn('redis not installed, service.streamer.RedisStreamer will no be available!')
 
 
 class RedisWorker(BaseWorker):
diff --git a/src/aloha/times/timeout_async.py b/src/aloha/times/timeout_async.py
diff --git a/src/setup.py b/src/setup.py

Original file line number	Diff line number	Diff line change
`@@ -6,6 +6,7 @@ def main():`
`6`	`6`	`modules_to_load = [`
`7`	`7`	`"app_common.api.api_common_sys_info",`
`8`	`8`	`"app_common.api.api_common_query_postgres",`
	`9`	`+ "app_common.api.api_multipart",`
`9`	`10`	`]`
`10`	`11`
`11`	`12`	`if 'service' not in SETTINGS.config:`
Original file line number	Diff line number	Diff line change
`@@ -24,7 +24,7 @@ def __init__(self, db_config, **kwargs):`
`24`	`24`	`try:`
`25`	`25`	`self.db = create_engine(`
`26`	`26`	`'mysql+pymysql://{user}:{password}@{host}:{port}/{dbname}'.format(**self._config),`
`27`		`- encoding='utf-8', pool_size=50, pool_recycle=500, pool_pre_ping=True, **kwargs`
	`27`	`+ pool_size=50, pool_recycle=500, pool_pre_ping=True, **kwargs`
`28`	`28`	`)`
`29`	`29`	`LOG.debug("MySQL connected: {host}:{port}/{dbname}".format(**self._config))`
`30`	`30`	`except Exception as e:`