fish 1 week ago
parent
commit
a8cb6f0cfa
2 changed files with 131 additions and 0 deletions
  1. 73 0
      README.md
  2. 58 0
      convert_abc.py

+ 73 - 0
README.md

@@ -1,2 +1,75 @@
 # chatmusician
 
+基于 chatmusician 构建中国传统音乐数据集,并进行模型训练与微调。
+
+## Develop
+
+运行 chatmusician
+```
+git clone https://github.com/hf-lin/chatmusician
+
+pip install -r requirements.txt 
+
+apt-get update
+apt-get install abcmidi
+
+cd chatmusician/
+python model/infer/chatmusician_web_demo.py -c "m-a-p/chatmusician" --server_port 8888
+
+
+cd chatmusician/
+python model/infer/predict.py --base_model {merged_model_path} --with_prompt --interactive
+
+```
+- Python 3.8
+- Pytorch 2.0
+- CUDA 11.4
+- Deepspeed 0.10
+
+
+## 训练微调
+
+整理收集数据并进行预处理:
+```
+# 把音频转为乐谱
+python convert_abc.py
+
+# 将文本数据转换为模型可以处理的 token IDs
+cd chatmusician
+python model/train/data_preprocess.py \
+    -t m-a-p/chatmusician-Base \
+    -i /path/to/your/dataset \
+    -o datasets \
+    --tokenize_fn sft 
+
+```
+
+训练:
+```
+# 微调
+./model/train/scripts/train.sh datasets m-a-p/chatmusician-Base
+
+# 合并 Peft 模型
+cd chatmusician/
+python model/train/merge.py --ori_model_dir m-a-p/chatmusician-Base --model_dir /path/to/lora/checkpoint --output_dir /path/to/output
+
+
+# 测试
+from transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("m-a-p/chatmusician-Base")
+
+def preprocess_data(data):
+    tokenized_data = []
+    for sample in data:
+        input_text = sample["input"]
+        output_text = sample["output"]
+        input_ids = tokenizer.encode(input_text, return_tensors="pt")
+        output_ids = tokenizer.encode(output_text, return_tensors="pt")
+        tokenized_data.append({"input_ids": input_ids, "output_ids": output_ids})
+    return tokenized_data
+
+# 假设 data 是你的中国传统音乐数据集
+preprocessed_data = preprocess_data(data)
+
+```

+ 58 - 0
convert_abc.py

@@ -0,0 +1,58 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+'''
+@Contact :   liuyuqi.gov@msn.cn
+@Time    :   2025/03/18 13:45:29
+@License :   Copyright © 2017-2022 liuyuqi. All Rights Reserved.
+@Desc    :   
+
+批量把音频转换为abc乐谱格式
+pip install pydub music21
+
+apt-get update
+apt-get install ffmpeg
+
+'''
+
+import os
+from pydub import AudioSegment
+from music21 import *
+
+def audio_to_score(audio_file_path):
+    try:
+        # 读取音频文件
+        audio = AudioSegment.from_file(audio_file_path)
+        # 这里可以添加更复杂的音频特征提取逻辑,例如音高检测等
+        # 为了简化示例,我们假设已经提取到了音高和节奏信息
+        # 这里简单创建一个简单的音符序列
+        s = stream.Stream()
+        # 添加一个 C4 音符,持续时间为 1 拍
+        n = note.Note('C4', quarterLength=1)
+        s.append(n)
+        return s
+    except Exception as e:
+        print(f"音频转换为乐谱时出现错误: {e}")
+        return None
+
+def batch_audio_to_score(input_folder, output_folder):
+    # 确保输出文件夹存在
+    os.makedirs(output_folder, exist_ok=True)
+    # 遍历输入文件夹中的所有文件
+    for root, dirs, files in os.walk(input_folder):
+        for file in files:
+            if file.endswith(('.mp3', '.wav', '.ogg')):
+                audio_file_path = os.path.join(root, file)
+                score = audio_to_score(audio_file_path)
+                if score:
+                    # 生成输出文件路径
+                    output_file_name = os.path.splitext(file)[0] + '.midi'
+                    output_file_path = os.path.join(output_folder, output_file_name)
+                    # 保存乐谱
+                    score.write('midi', fp=output_file_path)
+                    print(f"已将 {audio_file_path} 转换为 {output_file_path}")
+
+if __name__ == "__main__":
+    input_folder = 'your_input_audio_folder'
+    output_folder = 'your_output_score_folder'
+    batch_audio_to_score(input_folder, output_folder)
+