# Convert news_tensite_xml.dat from GBK to UTF-8, then store the lines that
# contain a <content> tag in corpus.txt.

#######################################
# Convert a GBK-encoded file to UTF-8 and keep only the lines that contain
# the literal text "<content>".
# Arguments:
#   $1 - input file, GBK-encoded (default: news_tensite_xml.dat)
#   $2 - output file (default: corpus.txt)
# Outputs:
#   Writes the matching lines to the output file; diagnostics go to stderr.
# Returns:
#   1 if the input file is not readable; otherwise the exit status of grep
#   (non-zero when no line matches).
#######################################
extract_content() {
  local input=${1:-news_tensite_xml.dat}
  local output=${2:-corpus.txt}

  # Check first so a missing input does not truncate an existing output file.
  if [ ! -r "$input" ]; then
    printf 'extract_content: cannot read %s\n' "$input" >&2
    return 1
  fi

  # -c makes iconv omit characters it cannot convert instead of aborting.
  iconv -f gbk -t utf-8 -c < "$input" | grep -- '<content>' > "$output"
}

extract_content