From 64b31623ac8a2c4cf5347f6fbbf7257ee61a5437 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Mon, 24 Feb 2025 16:39:26 +0100 Subject: [PATCH] generalize script --- .gitignore | 174 +++++++++++++++++++++++++++ scripts/translation.py | 78 ++++++++++++ scripts/{vi-translation.py => vi.py} | 87 ++------------ units/vi/unit0/introduction.mdx | 6 +- 4 files changed, 262 insertions(+), 83 deletions(-) create mode 100644 .gitignore create mode 100644 scripts/translation.py rename scripts/{vi-translation.py => vi.py} (57%) diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..0a19790 --- /dev/null +++ b/.gitignore @@ -0,0 +1,174 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# UV +# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +#uv.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/latest/usage/project/#working-with-version-control +.pdm.toml +.pdm-python +.pdm-build/ + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +#.idea/ + +# Ruff stuff: +.ruff_cache/ + +# PyPI configuration file +.pypirc diff --git a/scripts/translation.py b/scripts/translation.py new file mode 100644 index 0000000..9aa6c0a --- /dev/null +++ b/scripts/translation.py @@ -0,0 +1,78 @@ +import os +import sys +import re +from huggingface_hub import InferenceClient + +# Get the directory containing the current script +script_dir = os.path.dirname(os.path.abspath(__file__)) +default_inp_dir = os.path.join(script_dir, '..', 'units/en') +default_model = "deepseek-ai/DeepSeek-R1" +default_client = InferenceClient( + provider="together", + # api_key is read from the environment +) + +def auto_translate( + output_lang: str, + prompt: callable, + inp_dir: str = default_inp_dir, + model: str = default_model, + client: InferenceClient = default_client +): + get_output_path = lambda x: x.replace('/en', f'/{output_lang}') + escape_special_tokens = lambda x: x.replace('', '<%%think%%>').replace('', '<%%/think%%>') + unescape_special_tokens = lambda x: x.replace('<%%think%%>', '').replace('<%%/think%%>', '') + + # Get the list of all files in the directory, recursively + inp_files: list[str] = [] + print('Collecting files...') + for root, dirs, files in os.walk(inp_dir): + for file in files: + if file.endswith('.mdx') or file == "_toctree.yml": + fname = os.path.join(root, file) + print(' +', fname) + inp_files.append(fname) + + def write_out_file(fpath: str, content: str): + base_path = os.path.dirname(fpath) + os.makedirs(base_path, exist_ok=True) + with open(fpath, 'w', encoding='utf-8') as f: + f.write(content) + + # Read the content of the file and process + for i, inp_file in enumerate(inp_files): + out_file = get_output_path(inp_file) + if os.path.exists(out_file): + print(f'[{i+1}/{len(inp_files)}] Skipping file: {inp_file}') + continue + with open(inp_file, 'r', encoding='utf-8') as f: + content: str = f.read() + content = escape_special_tokens(content) + if content.strip() == "": + print(f'[{i+1}/{len(inp_files)}] Skipping empty file: {inp_file}') + write_out_file(out_file, "") + continue + + print(f'[{i+1}/{len(inp_files)}] Processing file: {inp_file}') + stream = client.chat.completions.create( + model=model, + temperature=0.0, + messages=[ + {"role": "user", "content": prompt(content)}, + ], + stream=True, + ) + final_text = "" + for chunk in stream: + print(chunk.choices[0].delta.content, end="") + sys.stdout.flush() + final_text += chunk.choices[0].delta.content + # Optionally filter ... reasoning process + final_text = final_text.split('').pop().strip() + # Write the output to the file + final_text = unescape_special_tokens(final_text) + write_out_file(out_file, final_text) + print() + print(f' -> Translated to: {out_file}') + print("--" * 20) + #break diff --git a/scripts/vi-translation.py b/scripts/vi.py similarity index 57% rename from scripts/vi-translation.py rename to scripts/vi.py index e6a98a5..215792e 100644 --- a/scripts/vi-translation.py +++ b/scripts/vi.py @@ -1,9 +1,8 @@ -import os -import sys -import re -from huggingface_hub import InferenceClient +from translation import auto_translate -PROMPT = lambda content: f''' +output_lang = "vi" + +prompt = lambda content: f''' You are a translator for the Vietnamese translation team. You are tasked with translating the following text into Vietnamese. You must follow these instructions: - Translate the text into Vietnamese, while keeping the original formatting (either Markdown, MDX or HTML) - Inside code blocks, translate the comments but leave the code as-is ; If the code block contains quite plain texts, you MUST provide the translation in
tag. @@ -70,79 +69,7 @@ Please translate the following text to vietnamese: === END OF TEXT === '''.strip() -# Get the directory containing the current script -script_dir = os.path.dirname(os.path.abspath(__file__)) -inp_dir = os.path.join(script_dir, '..', 'units/en') -get_our_path = lambda x: x.replace('/en', '/vi') -model = "deepseek-ai/DeepSeek-R1" -client = InferenceClient( - provider="together", - # api_key is read from the environment +auto_translate( + prompt=prompt, + output_lang=output_lang, ) - -def auto_translate( - inp_dir: str, - get_our_path: callable, - model: str, - client: InferenceClient, - PROMPT: callable -): - escape_special_tokens = lambda x: x.replace('', '<%%think%%>').replace('', '<%%/think%%>') - unescape_special_tokens = lambda x: x.replace('<%%think%%>', '').replace('<%%/think%%>', '') - - # Get the list of all files in the directory, recursively - inp_files: list[str] = [] - print('Collecting files...') - for root, dirs, files in os.walk(inp_dir): - for file in files: - if file.endswith('.mdx') or file == "_toctree.yml": - fname = os.path.join(root, file) - print(' +', fname) - inp_files.append(fname) - - def write_out_file(fpath: str, content: str): - base_path = os.path.dirname(fpath) - os.makedirs(base_path, exist_ok=True) - with open(fpath, 'w', encoding='utf-8') as f: - f.write(content) - - # Read the content of the file and process - for i, inp_file in enumerate(inp_files): - out_file = get_our_path(inp_file) - if os.path.exists(out_file): - print(f'[{i+1}/{len(inp_files)}] Skipping file: {inp_file}') - continue - with open(inp_file, 'r', encoding='utf-8') as f: - content: str = f.read() - content = escape_special_tokens(content) - if content.strip() == "": - print(f'[{i+1}/{len(inp_files)}] Skipping empty file: {inp_file}') - write_out_file(out_file, "") - continue - - print(f'[{i+1}/{len(inp_files)}] Processing file: {inp_file}') - stream = client.chat.completions.create( - model=model, - temperature=0.0, - messages=[ - {"role": "user", "content": PROMPT(content)}, - ], - stream=True, - ) - final_text = "" - for chunk in stream: - print(chunk.choices[0].delta.content, end="") - sys.stdout.flush() - final_text += chunk.choices[0].delta.content - # Optionally filter ... reasoning process - final_text = final_text.split('').pop().strip() - # Write the output to the file - final_text = unescape_special_tokens(final_text) - write_out_file(out_file, final_text) - print() - print(f' -> Translated to: {out_file}') - print("--" * 20) - #break - -if __name__ == '__main__': - auto_translate(inp_dir, get_our_path, model, client, PROMPT) diff --git a/units/vi/unit0/introduction.mdx b/units/vi/unit0/introduction.mdx index 18686a2..b6a6fbd 100644 --- a/units/vi/unit0/introduction.mdx +++ b/units/vi/unit0/introduction.mdx @@ -12,7 +12,7 @@ Khóa học miễn phí này sẽ dẫn dắt bạn từ **người mới bắt Chương đầu tiên sẽ giúp bạn làm quen: -- Khám phá **syllabus của khóa học**. +- Khám phá **tổng quan khóa học**. - **Chọn lộ trình** phù hợp (tự học hoặc theo quy trình cấp chứng chỉ). - **Nhận thông tin chi tiết về quy trình cấp chứng chỉ và deadline**. - Làm quen với đội ngũ xây dựng khóa học. @@ -52,9 +52,9 @@ Khóa học bao gồm: Sau khi hoàn thành khóa học, bạn có thể gửi phản hồi [👉 qua form này](https://docs.google.com/forms/d/e/1FAIpQLSe9VaONn0eglax0uTwi29rIn4tM7H2sYmmybmG5jJNlE5v0xA/viewform?usp=dialog) -## Syllabus khóa học [[syllabus]] +## Tổng quan khóa học [[syllabus]] -Đây là **syllabus tổng quan**. Danh sách chi tiết sẽ được cập nhật cùng mỗi Chương. +Đây là **tổng quan khóa học**. Danh sách chi tiết sẽ được cập nhật cùng mỗi Chương. | Chương | Chủ đề | Mô tả | | :---- | :---- | :---- |