|

# 网易云评论爬取
# 加密算法破解
import sys
import re # 正则表达式,进行文字匹配
from bs4 import BeautifulSoup # (网页解析,获取数据)
# from urllib import request
# from urllib import error
import requests
from Crypto.Cipher import AES
from base64 import b64encode
# from urllib import parse
import json
# Endpoint that the main script POSTs the encrypted form to in order to fetch comments.
url = "https://music.163.com/weapi/comment/resource/comments/get?csrf_token="
#https://music.163.com/weapi/comment/resource/comments/get?csrf_token=
# The bare string below is never read at runtime. It keeps notes copied from the
# site's JavaScript: the call to window.asrsea(...) that produces `params` and
# `encSecKey`, plus the fixed values obtained by evaluating the three buV0x(...)
# arguments in the browser console ('010001', the long hex modulus, and
# '0CoJUm6Qyw8W8jud').
''' var bKB1x = window.asrsea(JSON.stringify(i9b), buV0x(["流泪", "强"]), buV0x(Rg4k.md), buV0x(["爱心", "女孩", "惊恐", "大笑"]));
e9f.data = j9a.cr0x({ #(d,e,f,g)
params: bKB1x.encText,
encSecKey: bKB1x.encSecKey
#buV0x(["流泪", "强"]) 放入控制台跑代码得到固定值
#'010001'
#buV0x(Rg4k.md) 同上
#00e0b509f6259df8642dbc35662901477df22677ec152b5ff68ace615bb7b725152b3ab17a876aea8a5aa76d2e417629ec4ee341f56135fccf695280104e0312ecbda92557c93870114af6c9d05c4f7f0c3685b7a46bee255932575cce10b424d813cfe4875d3e82047b97ddef52741d546b8e289dc6935b3ece0462db0a22b8e7
#buV0x(["爱心", "女孩", "惊恐", "大笑"])
#0CoJUm6Qyw8W8jud
})
}'''
# Flag set to 1 by the main script once the last requested page has been written;
# the final de-duplication pass only runs when it is 1.
op=0
# HTTP headers sent with every request. The user-agent value is a placeholder
# (it literally says "your own UA") and is meant to be replaced before running.
headers={'user-agent':'自己的ua'}
def remove_duplicates(src_path=r'./网易云音乐评论.txt', dst_path=r'./去除重复值后的文本.txt'):
    """Copy a text file line by line, keeping only the first occurrence of each line.

    Args:
        src_path: UTF-8 text file to read. Defaults to the file the scraper writes.
        dst_path: UTF-8 text file to (over)write with the de-duplicated lines.

    Raises:
        FileNotFoundError: if ``src_path`` does not exist. The source is opened
            first, so in that case ``dst_path`` is not created or truncated.

    Every written line is terminated with a newline, including the last one,
    even if the source file did not end with a newline.
    """
    seen = set()
    # Context managers guarantee both files are closed even if reading or
    # writing fails part-way through.
    with open(src_path, 'r', encoding='utf-8') as f_read, \
            open(dst_path, 'w', encoding='utf-8') as f_write:
        for raw_line in f_read:
            line = raw_line.strip('\n')
            if line not in seen:
                f_write.write(line + '\n')
                seen.add(line)
# De-duplicate whatever a previous run left behind. On a first run the comment
# file does not exist yet, so a missing file is reported and skipped instead of
# crashing the script before it can prompt for input.
try:
    remove_duplicates()
except FileNotFoundError:
    print('未找到评论文件,跳过去重')
else:
    print('去重完成')
# 16-character AES key used for the second encryption pass in get_params.
# In the site's JS (function d in the reference string at the end of the file)
# this value is generated randomly per request; here it is pinned to one value.
i = "c0Aitq6E14zqzMfy"
# Value the notes near the top of the file record for buV0x(["流泪", "强"]).
# Not referenced by the Python code visible in this file.
e = "010001"
# AES key used for the first encryption pass in get_params; the notes record it
# as the value of buV0x(["爱心", "女孩", "惊恐", "大笑"]).
g = "0CoJUm6Qyw8W8jud"
# Value the notes record for buV0x(Rg4k.md).
# Not referenced by the Python code visible in this file.
f = "00e0b509f6259df8642dbc35662901477df22677ec152b5ff68ace615bb7b725152b3ab17a876aea8a5aa76d2e417629ec4ee341f56135fccf695280104e0312ecbda92557c93870114af6c9d05c4f7f0c3685b7a46bee255932575cce10b424d813cfe4875d3e82047b97ddef52741d546b8e289dc6935b3ece0462db0a22b8e7"
# Encryption helpers
def get_encSecKey():
    """Return the fixed ``encSecKey`` form value sent with every request.

    The value is a hard-coded hex string. NOTE(review): presumably it is what
    the site's JS computes for the pinned AES key ``i`` -- confirm if ``i`` is
    ever changed, since the two would then have to be regenerated together.
    """
    return "2c8ec4bc37aa7d9aa8bb8bc9494ab7639961a1b2e3607b31241630e0902b009c61e88f5a97fc67ca92f7bdf1b1b193d131eb02268f29a83a62aae22e1e55ed9a514079ef39d5974a03e8cffdfaf44eaa78779a9e8edbf47a3ad0238aee6e7374d9fe9668644b19197b3efc1869ad207b663d35b233c5f643e394df489ba35a3f"
def get_params(data):
    """Build the ``params`` form value by encrypting ``data`` twice.

    Args:
        data: The request payload as a JSON string.

    Returns:
        The Base64 text produced by encrypting ``data`` with key ``g`` and then
        encrypting that result again with key ``i`` (the same two-pass order as
        function ``d`` in the JS reference kept in this file).
    """
    first = enc_params(data, g)
    second = enc_params(first, i)
    return second
def enc_params(data, key):
    """Encrypt ``data`` with AES in CBC mode and return the result as Base64 text.

    Args:
        data: Plain text to encrypt; it is padded with ``to_16`` first.
        key: AES key as text; it is UTF-8 encoded before use.

    Returns:
        The ciphertext, Base64-encoded and decoded to ``str``.
    """
    iv = "0102030405060708"  # fixed IV, same literal as in the JS reference
    padded = to_16(data)
    cipher = AES.new(key=key.encode("utf-8"), IV=iv.encode("utf-8"), mode=AES.MODE_CBC)
    encrypted = cipher.encrypt(padded.encode("utf-8"))
    return b64encode(encrypted).decode("utf-8")
def to_16(data):
    """Pad ``data`` so that its UTF-8 encoding is a multiple of 16 bytes long.

    Between 1 and 16 copies of ``chr(pad)`` are appended, where ``pad`` is the
    number of bytes added (so already-aligned input gains a full 16-byte block).

    The pad length is computed from the encoded byte length rather than the
    character count: callers encode the result as UTF-8 before encrypting, and
    non-ASCII characters occupy more than one byte. For pure-ASCII input the
    result is the same either way.

    Args:
        data: Text to pad.

    Returns:
        ``data`` followed by the padding characters.
    """
    pad = 16 - len(data.encode("utf-8")) % 16
    # chr(pad) is in the range 1..16, i.e. a single byte once UTF-8 encoded.
    return data + chr(pad) * pad
def _write_comment(fp, comment, label, separator, with_avatar=False):
    """Write one comment record (nickname, text, VIP flag, like count) to ``fp``.

    Args:
        fp: Writable text file.
        comment: One comment object from the API response.
        label: Text written immediately before the nickname line.
        separator: Line written after the record.
        with_avatar: When true, also write the user's avatar URL.
    """
    user = comment['user']
    fp.write(label)
    fp.write('昵称:' + user['nickname'] + '\n')
    fp.write('评论:' + comment['content'] + '\n')
    if with_avatar:
        fp.write('头像:' + user['avatarUrl'] + '\n')
    # A missing vipRights object means the user has no VIP; the labels were
    # previously swapped. NOTE(review): assumes the API uses null for non-VIP
    # users -- confirm against a live response.
    if user['vipRights'] is None:
        fp.write('vip:no' + '\n')
    else:
        fp.write('vip:yes' + '\n')
    fp.write('点赞数' + str(comment['likedCount']) + '\n')
    fp.write(separator + '\n')


if __name__ == '__main__':
    page = int(input("输入爬取页数:"))
    rid = int(input("输入音乐id:"))
    page_size = 20
    # The context manager closes (and flushes) the file before de-duplication
    # reads it back, and also when a request or a write fails.
    with open('./网易云音乐评论.txt', 'w', encoding='utf-8') as fp:
        for u in range(1, page + 1):
            # Every request asks for "page 1" with a growing page size, so the
            # response for iteration u contains the first u * 20 comments.
            data = {
                "csrf_token": "",
                "cursor": "-1",
                "offset": "0",
                "orderType": "1",
                "pageNo": "1",
                "pageSize": "{}".format(u * page_size),
                "rid": "R_SO_4_{}".format(rid),
                "threadId": "R_SO_4_{}".format(rid),
            }
            response = requests.post(url, data={
                "params": get_params(json.dumps(data)),
                "encSecKey": get_encSecKey()
            }, headers=headers)
            result = json.loads(response.content.decode('utf-8'))
            payload = result['data']

            # Hot comments come back with every request; write them once only,
            # and tolerate the field being absent or null.
            if u == 1:
                for hot in payload.get('hotComments') or []:
                    _write_comment(fp, hot, 'hotComments' + ' ',
                                   '-------------------------------------')

            # Only the last 20 entries are new on this iteration. Writing the
            # first 20 every time is what produced the repeated records.
            # Slicing also copes with responses holding fewer than 20 comments.
            comments = payload.get('comments') or []
            for comment in comments[(u - 1) * page_size:u * page_size]:
                _write_comment(fp, comment, 'comments',
                               '-------------------------------------------------------',
                               with_avatar=True)

            print('第{}页爬取完毕'.format(u))
            if u == page:
                op = 1
    if op == 1:
        remove_duplicates()
# Inert string (never used at runtime): a disabled snippet that would collect
# every Chinese character found in the response text into one string.
'''
# chinese = re.findall('[\u4e00-\u9fa5]',html)
# comments = ""
# for i in chinese:
# comments+=i
# print(comments)
'''
# Inert string (never used at runtime): the site's JavaScript kept for reference.
# a(n) builds a random n-character key, b(text, key) is AES-CBC with the fixed IV
# "0102030405060708", c(...) is the RSA step, and d(...) -- exported as
# window.asrsea -- encrypts the payload twice to produce encText and uses c to
# produce encSecKey.
'''function a(a) {
var d, e, b = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789", c = "";
for (d = 0; a > d; d += 1)
e = Math.random() * b.length,
e = Math.floor(e),
c += b.charAt(e);
return c
}
function b(a, b) {
var c = CryptoJS.enc.Utf8.parse(b)
, d = CryptoJS.enc.Utf8.parse("0102030405060708")
, e = CryptoJS.enc.Utf8.parse(a)
, f = CryptoJS.AES.encrypt(e, c, {
iv: d,
mode: CryptoJS.mode.CBC
});
return f.toString()
}
function c(a, b, c) {
var d, e;
return setMaxDigits(131),
d = new RSAKeyPair(b,"",c),
e = encryptedString(d, a)
}
function d(d, e, f, g) {
var h = {}
, i = a(16);
h.encText = b(d, g),#b就是加密算法
h.encText = b(h.encText, i),#params的结果,两次加密
h.encSecKey = c(i, e, f),
return h
}
function e(a, b, d, e) {
var f = {};
return f.encText = c(a + e, b, d),
f
}
window.asrsea = d,'''
注意:后面的保存借鉴了CSDN上一位作者的方法:python爬虫--爬取网易云音乐评论_南岸青栀*的博客-CSDN博客_python爬取网易云音乐评论
这位博主写得非常详细,但是不知道为什么运行的代码收集的信息总是有重复的部分,所以我添加了去重的功能,以供大家参考,如果有大佬能说明原因,不胜感激
另外,添加了输入音乐ID爬取的功能 |
|