# chatmusician

基于 chatmusician 构建中国传统音乐数据集,并进行模型训练与微调。

## Develop

运行 chatmusician

```bash
git clone https://github.com/hf-lin/chatmusician
pip install -r requirements.txt
apt-get update
apt-get install abcmidi

cd chatmusician/
python model/infer/chatmusician_web_demo.py -c "m-a-p/chatmusician" --server_port 8888

cd chatmusician/
python model/infer/predict.py --base_model {merged_model_path} --with_prompt --interactive
```

- Python 3.8
- PyTorch 2.0
- CUDA 11.4
- DeepSpeed 0.10

## 训练微调

整理收集数据:

```bash
# 把音频转为乐谱
python convert_abc.py

# 将文本数据转换为模型可以处理的 token IDs
cd chatmusician
python model/train/data_preprocess.py \
    -t m-a-p/chatmusician-Base \
    -i /path/to/your/dataset \
    -o datasets \
    --tokenize_fn sft
```

训练:

```
# 微调
./model/train/scripts/train.sh datasets m-a-p/chatmusician-Base

# 合并 Peft 模型
cd chatmusician/
python model/train/merge.py --ori_model_dir m-a-p/chatmusician-Base --model_dir /path/to/lora/checkpoint --output_dir /path/to/output

# 测试
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("m-a-p/chatmusician-Base")

def preprocess_data(data):
    tokenized_data = []
    for sample in data:
        input_text = sample["input"]
        output_text = sample["output"]
        input_ids = tokenizer.encode(input_text, return_tensors="pt")
        output_ids = tokenizer.encode(output_text, return_tensors="pt")
        tokenized_data.append({"input_ids": input_ids, "output_ids": output_ids})
    return tokenized_data

# 假设 data 是你的中国传统音乐数据集
preprocessed_data = preprocess_data(data)
```