基于 chatmusician 构建中国传统音乐数据集,并进行模型训练与微调
运行 chatmusician
git clone https://github.com/hf-lin/chatmusician
pip install -r chatmusician/requirements.txt
apt-get update
apt-get install abcmidi
cd chatmusician/
python model/infer/chatmusician_web_demo.py -c "m-a-p/chatmusician" --server_port 8888
cd chatmusician/
python model/infer/predict.py --base_model {merged_model_path} --with_prompt --interactive
整理收集:
# 把音频转为乐谱
python convert_abc.py
# 将文本数据转换为模型可以处理的 token IDs
cd chatmusician
python model/train/data_preprocess.py \
-t m-a-p/chatmusician-Base \
-i /path/to/your/dataset \
-o datasets \
--tokenize_fn sft
训练:
# 微调
./model/train/scripts/train.sh datasets m-a-p/chatmusician-Base
# 合并 Peft 模型
cd chatmusician/
python model/train/merge.py --ori_model_dir m-a-p/chatmusician-Base --model_dir /path/to/lora/checkpoint --output_dir /path/to/output
# 测试
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("m-a-p/chatmusician-Base")
def preprocess_data(data):
    """Tokenize the input and output text of every sample in ``data``.

    Uses the module-level ``tokenizer`` defined earlier in this file.

    Args:
        data: Iterable of mappings; each one is indexed with the keys
            ``"input"`` and ``"output"`` to obtain the text to encode.

    Returns:
        A list with one dict per sample, holding the encoded
        ``"input_ids"`` and ``"output_ids"``.
    """
    tokenized_data = []
    for sample in data:
        # return_tensors="pt" asks the tokenizer for PyTorch tensors.
        input_ids = tokenizer.encode(sample["input"], return_tensors="pt")
        output_ids = tokenizer.encode(sample["output"], return_tensors="pt")
        tokenized_data.append({"input_ids": input_ids, "output_ids": output_ids})
    return tokenized_data
## 假设 data 是你的中国传统音乐数据集
preprocessed_data = preprocess_data(data)