import locale
import os
import math
import re
import datetime
import dateutil.parser
from typing import Dict, Tuple, List
import urllib.request
import urllib.parse
"""
ニコニコ実況のコメントをスレッドごとに保存するpythonスクリプト
保存場所は./logs/
ライセンス:NYSL http://www.kmonos.net/nysl/
todo
・コメントのxmlのバリデーションが不十分。コメントを削除された時は?コメント本文に</chat> がある場合は?
コメントxmlのレアパターン
・コメントに改行が入る場合がある。ので、正規表現はdotall必須。
・vposが-4707799 とかのすごい値になる事がある。dateプロパティと前後のコメントは普通。
"""
class ChatXml:
    """One ``<chat ...>...</chat>`` element taken from a thread response.

    Attributes set by ``__init__``:
        originalXml: the element text exactly as it was passed in.
        formattedXml: ``originalXml`` with human-readable ``date_str`` and
            ``vpos_str`` attributes inserted at the front of the opening tag.
        xmlData: ``(no, vpos, date)`` parsed from the opening tag; ``date`` is
            a naive ``datetime`` in the machine's local time zone.
        number: the ``no`` attribute (same value as ``xmlData[0]``).
    """

    # Opening tag of the element. A double-quoted attribute value is consumed
    # as one unit so that a ">" inside a value does not end the tag early.
    _OPEN_TAG_RE = re.compile(r'<chat\b(?:[^>"]|"[^"]*")*>')
    # "(?<!\S)" requires whitespace (or start of text) before the attribute
    # name, so a longer name that merely ends in "no"/"date" is not matched.
    _NO_RE = re.compile(r'(?<!\S)no="(\d+)"')
    _VPOS_RE = re.compile(r'(?<!\S)vpos="(-?\d+)"')
    _DATE_RE = re.compile(r'(?<!\S)date="(\d+)"')
    # Indexed by datetime.weekday(): Monday == 0 ... Sunday == 6.
    _WEEKDAYS_JA = "月火水木金土日"
    _localeConfigured = False

    def __init__(self, input: str) -> None:
        """Parse *input* and build its formatted variant.

        Raises:
            Exception: if *input* does not start with ``"<chat "`` or if the
                ``no`` / ``vpos`` / ``date`` attributes cannot be read.
        """
        self.originalXml = input
        self.formattedXml = self._chatElementInsertStr(input)
        self.xmlData = self._getChatXmlData(input)
        self.number = self.xmlData[0]

    def numerVposDateFormatStr(self) -> str:
        """Return a one-line ``no / vpos / date`` summary used for progress output."""
        dateStr = f"{self.xmlData[2]:%Y/%m/%d %H:%M:%S}"
        return f"no = {self.xmlData[0]:>10} , vpos = {int(self.xmlData[1])} , date = {dateStr}"

    @classmethod
    def _configureLocale(cls) -> None:
        """Try once to switch the process locale to ``ja_JP.UTF-8``.

        The original code did this for every comment and crashed when the
        locale was not installed. This class no longer needs the locale (the
        weekday comes from ``_WEEKDAYS_JA``), but other code in this file
        formats ``%a`` after ChatXml objects have been created, so the call is
        kept as a best-effort side effect.
        """
        if ChatXml._localeConfigured:
            return
        ChatXml._localeConfigured = True
        try:
            locale.setlocale(locale.LC_ALL, 'ja_JP.UTF-8')
        except locale.Error:
            # Deliberate best effort: a missing locale must not abort the run.
            pass

    @staticmethod
    def _formatVpos(vpos: int) -> str:
        """Format a vpos given in 1/100 s as ``M:SS.cc`` or ``H:MM:SS.cc``.

        Integer arithmetic is used throughout. A negative vpos (the comment
        server occasionally returns one) is rendered as ``-`` followed by the
        formatted magnitude instead of the floor/modulo mixture that float
        maths produced.
        """
        sign = "-" if vpos < 0 else ""
        totalSec, centi = divmod(abs(vpos), 100)
        totalMin, sec = divmod(totalSec, 60)
        if totalSec < 3600:
            return f"{sign}{totalMin}:{sec:02}.{centi:02}"
        hour, minute = divmod(totalMin, 60)
        return f"{sign}{hour}:{minute:02}:{sec:02}.{centi:02}"

    def _getChatXmlData(self, input: str) -> Tuple[int, int, datetime.datetime]:
        """Read the ``no``, ``vpos`` and ``date`` attributes from *input*.

        Only the opening tag is searched, so comment text that happens to
        contain something like ``no="1"`` is not mistaken for an attribute.
        If no opening tag can be recognised the whole string is searched.

        Returns:
            ``(no, vpos, date)`` where ``date`` is the Unix time converted
            with ``datetime.fromtimestamp`` (naive, local time).

        Raises:
            Exception: if any of the three attributes is missing or repeated,
                or if a self-closing opening tag is followed by more text.
        """
        tagMatch = self._OPEN_TAG_RE.search(input)
        head = tagMatch.group(0) if tagMatch else input
        if tagMatch and head.endswith("/>") and input[tagMatch.end():].strip():
            # A self-closing element followed by more text means two elements
            # were captured as one. The whole-string search used to reject
            # this via duplicated attributes; keep failing loudly.
            raise Exception(f"自己終了タグの後ろに別の内容がある。 \"{input}\"")
        noMatch = self._NO_RE.findall(head)
        vposMatch = self._VPOS_RE.findall(head)
        dateMatch = self._DATE_RE.findall(head)
        if len(noMatch) != 1:
            raise Exception("no属性が0個もしくは2個以上")
        if len(vposMatch) != 1:
            raise Exception("vpos属性が0個もしくは2個以上")
        if len(dateMatch) != 1:
            raise Exception("date属性が0個もしくは2個以上")
        return int(noMatch[0]), int(vposMatch[0]), datetime.datetime.fromtimestamp(int(dateMatch[0]))

    def _chatElementInsertStr(self, input: str) -> str:
        """Insert human-readable ``date_str`` / ``vpos_str`` attributes.

        Turns
        <chat thread="1270407602" no="14239" vpos="7637459" date="1270483976" name="hoge" user_id="719" premium="3">アニヲタ</chat>
        into
        <chat date_str="2020/01/01(月)00:00:00" vpos_str="00:00.000" thread="1270407602" no="14239" vpos="7637459" date="1270483976" name="hoge" user_id="719" premium="3">アニヲタ</chat>
        (the values shown are only an illustration of the layout).

        Raises:
            Exception: if *input* does not start with ``"<chat "`` or its
                attributes cannot be parsed.
        """
        if not input.startswith("<chat "):
            raise Exception(f"chatエレメントが検出できない。 \"{input}\"")
        _, vpos, date = self._getChatXmlData(input)
        self._configureLocale()
        weekday = self._WEEKDAYS_JA[date.weekday()]
        fDate = f"{date:%Y/%m/%d}({weekday}){date:%H:%M:%S}"
        fVpos = self._formatVpos(vpos)
        # count=1: only the opening tag may be rewritten, never the body text.
        return input.replace("<chat ", f"<chat date_str=\"{fDate}\" vpos_str=\"{fVpos}\" ", 1)
class Jikkyo:
    """Download all comments of one Niconico Jikkyo thread and save them.

    ``start()`` runs three steps in order: ``getflv`` (thread metadata),
    ``getwaybackkey`` (key for reading past comments) and the paged thread
    download, which finally writes one text file under ``./logs/<jkId>/``.
    """

    # Indexed by datetime.weekday(): Monday == 0 ... Sunday == 6.
    _WEEKDAYS_JA = "月火水木金土日"

    def __init__(self, cookie: str, jkId: str, startDateUnixTimeSec: int) -> None:
        """Store the request parameters; no network access happens here.

        Args:
            cookie: value sent as the ``user_session`` cookie.
            jkId: channel id sent as ``v`` to getflv; also the log directory name.
            startDateUnixTimeSec: Unix time sent as ``start_time`` to getflv.
        """
        self.cookie = cookie
        self.jkId = jkId
        self.startDateUnixTimeSec = startDateUnixTimeSec
        self.getFlv = {}  # type: Dict[str, str]
        self.waybackKey = ""

    def start(self):
        """Run the whole download: getflv, getwaybackkey, then the thread pages."""
        self._getFlv()
        self._getWaybackKey()
        self._getThread()

    @staticmethod
    def _parseKeyValueResponse(body: str) -> Dict[str, str]:
        """Parse a ``k1=v1&k2=v2`` response body into a dict of unquoted values.

        Fragments without ``=`` (for example an empty body or a trailing
        ``&``) are skipped instead of raising ``ValueError``.
        """
        result: Dict[str, str] = {}
        for piece in body.split("&"):
            key, sep, value = piece.partition("=")
            if sep:
                result[key] = urllib.parse.unquote(value)
        return result

    @staticmethod
    def _mergeNewChats(collected, chats) -> int:
        """Add *chats* to *collected* (keyed by ``.number``); return how many were new.

        A comment whose number is already present is ignored, so re-fetched
        comments never appear twice.
        """
        added = 0
        for chat in chats:
            if chat.number not in collected:
                collected[chat.number] = chat
                added += 1
        return added

    @classmethod
    def _formatJaDate(cls, dt: datetime.datetime) -> str:
        """Format *dt* like ``2010年04月05日(月)04時00分02秒``.

        The weekday comes from a fixed table and only numeric strftime
        fields are used, so the result does not depend on the process locale.
        """
        weekday = cls._WEEKDAYS_JA[dt.weekday()]
        return f"{dt:%Y}年{dt:%m}月{dt:%d}日({weekday}){dt:%H}時{dt:%M}分{dt:%S}秒"

    def _requestWithCookie(self, url: str) -> str:
        """GET *url* with the session cookie and return the body decoded as UTF-8."""
        headers = {
            'Content-Type': 'application/json',
            "Cookie": f"user_session={self.cookie}"
        }
        req = urllib.request.Request(url, None, headers)
        with urllib.request.urlopen(req) as res:
            return res.read().decode("utf-8")

    def _getFlv(self):
        """Call getflv, store the parsed response in ``self.getFlv`` and validate it.

        Raises:
            Exception: if a value needed later is missing, or if thread_id,
                base_time, open_time and start_time are not all equal.
        """
        # NOTE: the original author left open whether an end_time parameter
        # is needed here.
        url = f"http://jk.nicovideo.jp/api/v2/getflv?v={self.jkId}&start_time={self.startDateUnixTimeSec}"
        result = self._parseKeyValueResponse(self._requestWithCookie(url))
        self.getFlv = result

        timeKeys = ("thread_id", "base_time", "open_time", "start_time", "end_time")
        # ms / http_port / user_id are read later by _getThread; checking them
        # here reports the real cause instead of a bare KeyError.
        missing = [k for k in timeKeys + ("ms", "http_port", "user_id") if k not in result]
        if missing:
            raise Exception(f"getflvの応答に必要な値がありません: {missing} / 応答: {result}")

        for key in timeKeys:
            print(f"{key} : {unixTimeToStr(float(result[key]))}")
        if len({result[k] for k in timeKeys[:4]}) != 1:
            raise Exception("一致するはずの値が不一致")

    def _getWaybackKey(self):
        """Call getwaybackkey and store the key in ``self.waybackKey``.

        Raises:
            Exception: if the response carries no (or an empty) ``waybackkey``.
        """
        url = f"http://jk.nicovideo.jp/api/v2/getwaybackkey?thread={self.getFlv['thread_id']}"
        key = self._parseKeyValueResponse(self._requestWithCookie(url)).get("waybackkey")
        if not key:
            raise Exception("waybackKeyが取得出来ませんでした")
        self.waybackKey = key

    def _getThread(self):
        """Page backwards through the thread, then save everything collected.

        Each request asks for the 1000 comments preceding ``when``; the next
        request restarts at the oldest timestamp of the batch just received.
        The server re-sends comments at that boundary (the final page
        typically holds only that one comment), so comments are de-duplicated
        by number, and a page that adds nothing new ends the loop.
        """
        apiVersion = "20061206"
        whenParameter = self.getFlv['end_time']
        serverHost = f"{self.getFlv['ms']}:{self.getFlv['http_port']}"
        userId = self.getFlv['user_id']
        collected = {}  # type: Dict[int, ChatXml]
        while True:
            print(f"thread request. when = {datetime.datetime.fromtimestamp(int(whenParameter)):%Y/%m/%d %H:%M:%S}")
            # urlencode escapes the values; the wayback key in particular was
            # percent-decoded when it was read and must be re-encoded.
            query = urllib.parse.urlencode({
                "thread": self.getFlv['thread_id'],
                "res_from": -1000,
                "version": apiVersion,
                "when": whenParameter,
                "user_id": userId,
                "waybackkey": self.waybackKey,
            })
            req = urllib.request.Request(f"http://{serverHost}/api/thread?{query}")
            with urllib.request.urlopen(req) as res:
                apiResponse = res.read().decode("utf-8")
            # DOTALL: a comment body may contain line breaks.
            batch = [ChatXml(m) for m in re.findall(r"<chat .+?</chat>", apiResponse, re.DOTALL)]
            if not batch:
                break
            print("↓start")
            print("\n".join(c.numerVposDateFormatStr() for c in batch[0:2]))
            print("↓end")
            print("\n".join(c.numerVposDateFormatStr() for c in batch[-2:]))
            print("end")
            newCount = self._mergeNewChats(collected, batch)
            if len(batch) == 1 or newCount == 0:
                # Single comment: the usual final page. Nothing new: paging
                # further would repeat the same request forever.
                break
            whenParameter = min(int(c.xmlData[2].timestamp()) for c in batch)
        if not collected:
            print("no chat xml reseived")
        else:
            self._saveResult(list(collected.values()))

    def _saveResult(self, chatXmlList: "List[ChatXml]"):
        """Write the comments, sorted by number, one per line to a UTF-8 text file.

        The file goes to ``./logs/<jkId>/`` (created when missing). Its name
        holds the thread id, the comment count and the start/end time from
        getflv.
        """
        chatXmlList = sorted(chatXmlList, key=lambda x: x.number)
        os.makedirs(f"./logs/{self.jkId}", exist_ok=True)
        threadNo = int(self.getFlv["thread_id"])
        endDate = self._formatJaDate(datetime.datetime.fromtimestamp(int(self.getFlv['end_time'])))
        startDate = self._formatJaDate(datetime.datetime.fromtimestamp(int(self.getFlv['start_time'])))
        # Human-readable file name; edit here to match the nicojk-style tools
        # (which would be f"{threadNo}.txt"). Example:
        # 1270407602-__15257.res_2010年04月05日(月)04時00分02秒~2010年04月06日(火)04時04分56秒
        saveFileName = f"{threadNo}-{len(chatXmlList):_>7}.res_{startDate}~{endDate}.txt"
        saveFilePath = f"./logs/{self.jkId}/{saveFileName}"
        with open(saveFilePath, mode='w', encoding="utf-8") as f:
            f.writelines(f"{chatXml.formattedXml}\n" for chatXml in chatXmlList)
        print(f"save {len(chatXmlList)} comments. {saveFilePath}")
def unixTimeToStr(unixTimeSec: float) -> str:
    """Render a Unix timestamp as ``YYYY/MM/DD HH:MM:SS``.

    The conversion goes through ``datetime.fromtimestamp`` without a tz
    argument, so the text is in the machine's local time zone.
    """
    return datetime.datetime.fromtimestamp(unixTimeSec).strftime("%Y/%m/%d %H:%M:%S")
def getUnixTimeSec(str: str) -> int:
    """Parse a date/time string with ``dateutil`` and return its Unix time.

    The fractional part of the timestamp is dropped by ``int()``.
    """
    parsed = dateutil.parser.parse(str)
    return int(parsed.timestamp())
if __name__ == "__main__":
    # Value sent as the "user_session" cookie. A real one looks like
    # "user_session_0000000000000_0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef".
    sessionCookie = "xxxxxx"
    channelId = "jk1"
    startUnixSec = getUnixTimeSec("2011/03/11 04:04:00")
    Jikkyo(sessionCookie, channelId, startUnixSec).start()