本地加载

环境依赖:

pip install -i https://pypi.tuna.tsinghua.edu.cn/simple transformers sentencepiece
pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --index-url https://download.pytorch.org/whl/cu124

模型下载:
https://hf-mirror.com/facebook/nllb-200-distilled-600M
支持的语言:
https://hf-mirror.com/facebook/nllb-200-distilled-600M/blob/main/README.md

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Translate one English paragraph into Simplified Chinese using an NLLB-200
# checkpoint stored on the local disk. No device is selected explicitly here.
model_dir = "/data/models/facebook/nllb-200-distilled-600M"
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForSeq2SeqLM.from_pretrained(model_dir)

content = """
A database of Chinese surnames and Chinese given names (1930-2008). This database contains nationwide frequency statistics of 1,806 Chinese surnames and 2,614 Chinese characters used in given names, covering about 1.2 billion Han Chinese population (96.8% of the Han Chinese household-registered population born from 1930 to 2008 and still alive in 2008). This package also contains a function for computing multiple features of Chinese surnames and Chinese given names for scientific research (e.g., name uniqueness, name gender, name valence, and name warmth/competence).
"""

# The target language is chosen by forcing its language-code token to be the
# first token of the generated sequence.
target_lang_id = tokenizer.convert_tokens_to_ids("zho_Hans")

inputs = tokenizer(content, return_tensors="pt")
translated_tokens = model.generate(**inputs, forced_bos_token_id=target_lang_id, num_beams=4)

# Turn the generated ids back into text, dropping special tokens, and print
# one translation per line.
decoded_texts = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)
for text in decoded_texts:
    print(text)

拆分长文本

sudo apt-get install -y g++
pip install -i https://pypi.tuna.tsinghua.edu.cn/simple "stopes[mono]" botok khmer-nltk laonlp
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from stopes.pipelines.monolingual.utils.sentence_split import get_split_algo

# Translate a multi-sentence English paragraph into Simplified Chinese by
# splitting it into sentences first and translating them as one padded batch.
model_dir = "/data/models/facebook/nllb-200-distilled-600M"
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForSeq2SeqLM.from_pretrained(model_dir)

content = """
A database of Chinese surnames and Chinese given names (1930-2008). This database contains nationwide frequency statistics of 1,806 Chinese surnames and 2,614 Chinese characters used in given names, covering about 1.2 billion Han Chinese population (96.8% of the Han Chinese household-registered population born from 1930 to 2008 and still alive in 2008). This package also contains a function for computing multiple features of Chinese surnames and Chinese given names for scientific research (e.g., name uniqueness, name gender, name valence, and name warmth/competence).
"""


# Split the paragraph into individual sentences with the stopes splitter for
# English; the original author notes NLLB is meant to work on single sentences.
split_sentences = get_split_algo("eng", "default")
input_sentences = list(split_sentences(content))
sentence_count = len(input_sentences)
print(sentence_count)  # expected: 3

# The target language is chosen by forcing its language-code token to be the
# first token of every generated sequence.
target_lang_id = tokenizer.convert_tokens_to_ids("zho_Hans")

# padding=True lets sentences of different lengths share one batch tensor.
inputs = tokenizer(input_sentences, return_tensors="pt", padding=True)
translated_tokens = model.generate(**inputs, forced_bos_token_id=target_lang_id, num_beams=4)

# Print one translated sentence per line.
decoded_sentences = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)
for sentence in decoded_sentences:
    print(sentence)

cuda加载

import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Translate one English paragraph into Simplified Chinese on the GPU.

# Fail fast when CUDA is missing: this example is meant to run on a GPU only.
if not torch.cuda.is_available():
    raise RuntimeError("CUDA is not available on this machine.")

# CUDA availability was verified above, so a CPU fallback would be dead code.
device = torch.device("cuda")

model_dir = "/data/models/facebook/nllb-200-distilled-600M"
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForSeq2SeqLM.from_pretrained(model_dir)

# Move the model to the GPU.
model.to(device)

content = """
A database of Chinese surnames and Chinese given names (1930-2008). This database contains nationwide frequency statistics of 1,806 Chinese surnames and 2,614 Chinese characters used in given names, covering about 1.2 billion Han Chinese population (96.8% of the Han Chinese household-registered population born from 1930 to 2008 and still alive in 2008). This package also contains a function for computing multiple features of Chinese surnames and Chinese given names for scientific research (e.g., name uniqueness, name gender, name valence, and name warmth/competence).
"""

# Encode the input and move the resulting tensors to the GPU as well, so they
# live on the same device as the model.
inputs = tokenizer(content, return_tensors="pt").to(device)

# The target language is chosen by forcing its language-code token to be the
# first generated token.
translated_tokens = model.generate(
    **inputs, forced_bos_token_id=tokenizer.convert_tokens_to_ids("zho_Hans"), num_beams=4,
)

# Decode the generated token ids back into text and print each translation.
for translated in tokenizer.batch_decode(translated_tokens, skip_special_tokens=True):
    print(translated)

# Drop every GPU tensor created above before emptying the cache:
# empty_cache() only returns memory that is no longer referenced, so leaving
# translated_tokens alive would keep its allocation on the device.
del inputs, translated_tokens
torch.cuda.empty_cache()

pipeline

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

# Translate Simplified Chinese sentences into English through the high-level
# transformers `pipeline` helper, reusing a locally stored NLLB-200 checkpoint.
model_dir = "/data/models/facebook/nllb-200-distilled-600M"
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForSeq2SeqLM.from_pretrained(model_dir)

# Source/target language codes and the length limit are fixed once, when the
# pipeline is built.
pipeline_options = {
    "model": model,
    "tokenizer": tokenizer,
    "src_lang": 'zho_Hans',
    "tgt_lang": 'eng_Latn',
    "max_length": 512,
}
translator = pipeline('translation', **pipeline_options)

# Translate a small batch of sentences and print the raw pipeline result.
sentences = ["你好 世界", "青春"]
print(translator(sentences))
Logo

魔乐社区(Modelers.cn) 是一个中立、公益的人工智能社区,提供人工智能工具、模型、数据的托管、展示与应用协同服务,为人工智能开发及爱好者搭建开放的学习交流平台。社区通过理事会方式运作,由全产业链共同建设、共同运营、共同享有,推动国产AI生态繁荣发展。

更多推荐