From 2aed5f0c7bd7126f588c2df4fc9a4185185170af Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Mon, 19 Nov 2018 21:34:58 +0800 Subject: [PATCH 001/363] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index d35ae525..4ecec57f 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,7 @@ - 结果文件:保存在当前目录的weibo文件夹里,名字为"user_id.txt"的形式 # 运行环境 -- 开发语言:python2.7 +- 开发语言:python2/python3 - 系统: Windows/Linux # 使用说明 From 895bdd0673987e9b4c1656a21f2dccf98965186c Mon Sep 17 00:00:00 2001 From: chenlei Date: Tue, 20 Nov 2018 18:47:36 +0800 Subject: [PATCH 002/363] =?UTF-8?q?fix:=20=E5=88=A0=E9=99=A4=E5=BE=AE?= =?UTF-8?q?=E5=8D=9A=E5=86=85=E5=AE=B9=E4=B8=AD=E5=A4=9A=E4=BD=99=E7=9A=84?= =?UTF-8?q?=E9=9B=B6=E5=AE=BD=E5=BA=A6=E5=AD=97=E7=AC=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Issue #15 --- weiboSpider.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/weiboSpider.py b/weiboSpider.py index b1445653..773ee5c6 100644 --- a/weiboSpider.py +++ b/weiboSpider.py @@ -88,7 +88,7 @@ def get_long_weibo(self, weibo_link): selector = etree.HTML(html) info = selector.xpath("//div[@class='c']")[1] wb_content = info.xpath("div/span[@class='ctt']")[0].xpath( - "string(.)").encode(sys.stdout.encoding, "ignore").decode( + "string(.)").replace(u"\u200b", "").encode(sys.stdout.encoding, "ignore").decode( sys.stdout.encoding) return wb_content except Exception as e: @@ -104,7 +104,7 @@ def get_retweet(self, is_retweet, info, wb_content): return wb_content else: original_user = original_user[0] - retweet_reason = info.xpath("div")[-1].xpath("string(.)").encode( + retweet_reason = info.xpath("div")[-1].xpath("string(.)").replace(u"\u200b", "").encode( sys.stdout.encoding, "ignore").decode( sys.stdout.encoding) retweet_reason = retweet_reason[:retweet_reason.rindex(u"赞")] @@ -139,7 +139,7 @@ def get_weibo_info(self): for i in range(0, len(info) - 2): # 微博内容 str_t = 
info[i].xpath("div/span[@class='ctt']") - weibo_content = str_t[0].xpath("string(.)").encode( + weibo_content = str_t[0].xpath("string(.)").replace(u"\u200b", "").encode( sys.stdout.encoding, "ignore").decode( sys.stdout.encoding) weibo_content = weibo_content[:-1] From 7f933c60a8e8c5ca61e68d212d83f3c2c5765e50 Mon Sep 17 00:00:00 2001 From: chenlei Date: Thu, 6 Dec 2018 17:58:33 +0800 Subject: [PATCH 003/363] =?UTF-8?q?fix:=20=E4=BF=AE=E5=A4=8D=E6=97=A0?= =?UTF-8?q?=E6=B3=95=E8=8E=B7=E5=8F=96=E5=90=AB=E6=9C=89"=E6=98=BE?= =?UTF-8?q?=E7=A4=BA=E5=9C=B0=E5=9B=BE"=E8=80=8C=E6=B2=A1=E6=9C=89?= =?UTF-8?q?=E4=BD=8D=E7=BD=AE=E4=BF=A1=E6=81=AF=E7=9A=84=E6=83=85=E5=86=B5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Issue #33 --- weiboSpider.py | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/weiboSpider.py b/weiboSpider.py index 773ee5c6..74abdba9 100644 --- a/weiboSpider.py +++ b/weiboSpider.py @@ -168,14 +168,18 @@ def get_weibo_info(self): for a in a_list: if ("place.weibo.com" in a.xpath("@href")[0] and a.xpath("text()")[0] == u"显示地图"): - weibo_place = div_first.xpath( - "span[@class='ctt']/a")[-1] - if u"的秒拍视频" in div_first.xpath("span[@class='ctt']/a/text()")[-1]: - weibo_place = div_first.xpath( - "span[@class='ctt']/a")[-2] - weibo_place = weibo_place.xpath("string(.)").encode( - sys.stdout.encoding, "ignore").decode(sys.stdout.encoding) - break + weibo_a = div_first.xpath( + "span[@class='ctt']/a") + if len(weibo_a) >= 1: + weibo_place = weibo_a[-1] + if u"的秒拍视频" in div_first.xpath("span[@class='ctt']/a/text()")[-1]: + if len(weibo_a) >= 2: + weibo_place = weibo_a[-2] + else: + weibo_place = u"无" + weibo_place = weibo_place.xpath("string(.)").encode( + sys.stdout.encoding, "ignore").decode(sys.stdout.encoding) + break self.weibo_place.append(weibo_place) print(u"微博位置: " + weibo_place) @@ -308,8 +312,8 @@ def start(self): def main(): try: # 使用实例,输入一个用户id,所有信息都会存储在wb实例中 - 
user_id = 5053084638 # 可以改成任意合法的用户id(爬虫的微博id除外) - filter = 0 # 值为0表示爬取全部微博(原创微博+转发微博),值为1表示只爬取原创微博 + user_id = 1476938315 # 可以改成任意合法的用户id(爬虫的微博id除外) + filter = 1 # 值为0表示爬取全部微博(原创微博+转发微博),值为1表示只爬取原创微博 wb = Weibo(user_id, filter) # 调用Weibo类,创建微博实例wb wb.start() # 爬取微博信息 print(u"用户名: " + wb.username) From 3ac980a6556bc487f30c6b0a2229f6ab5f3152a7 Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Sat, 8 Dec 2018 20:39:44 +0800 Subject: [PATCH 004/363] Update README.md --- README.md | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 4ecec57f..3904b0d0 100644 --- a/README.md +++ b/README.md @@ -60,11 +60,13 @@ $ python filepath/weibospider.py # 如何获取cookie 1.用Chrome打开
-2.按F12键打开Chrome开发者工具;
-3.点开“Network”,将“Preserve log”选中,输入微博的用户名、密码,登录,如图所示: +2.输入微博的用户名、密码,登录,如图所示: ![](https://picture.cognize.me/cognize/github/weibospider/cookie1.png) -4.点击Chrome开发者工具“Name"列表中的"m.weibo.cn",点击"Headers",其中"Request Headers"下,"Cookie"后的值即为我们要找的cookie值,复制即可,如图所示: +登录成功后会跳转到;
+3.按F12键打开Chrome开发者工具,在地址栏输入并跳转到,跳转后会显示如下类似界面:
![](https://picture.cognize.me/cognize/github/weibospider/cookie2.png) +4.点击Chrome开发者工具“Name"列表中的"weibo.cn",点击"Headers",其中"Request Headers"下,"Cookie"后的值即为我们要找的cookie值,复制即可,如图所示: +![](https://picture.cognize.me/cognize/github/weibospider/cookie3.png) # 如何获取user_id 1.打开网址,搜索我们要找的人,如”郭碧婷“,进入她的主页;
From 3b7deb9c04707f368708403ffdeefb60d8bfa8dd Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Sat, 8 Dec 2018 21:58:19 +0800 Subject: [PATCH 005/363] Update README.md --- README.md | 40 +++++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/README.md b/README.md index 3904b0d0..2e6245e8 100644 --- a/README.md +++ b/README.md @@ -11,10 +11,12 @@ - 关注数:用户关注的微博账号数量 - 粉丝数:用户的粉丝数 - 微博内容:以list的形式存储了用户所有微博内容 +- 微博位置:以list的形式存储了用户所有微博的发布位置 - 微博发布时间:以list的形式存储了用户所有微博的发布时间 - 微博对应的点赞数:以list的形式存储了用户所有微博对应的点赞数 - 微博对应的转发数:以list的形式存储了用户所有微博对应的转发数 - 微博对应的评论数:以list的形式存储了用户所有微博对应的评论数 +- 微博发布工具:以list的形式存储了用户所有微博的发布工具,如iPhone客户端、HUAWEI Mate 20 Pro等 - 结果文件:保存在当前目录的weibo文件夹里,名字为"user_id.txt"的形式 # 运行环境 @@ -22,15 +24,21 @@ - 系统: Windows/Linux # 使用说明 -1.下载脚本 +## 1.下载脚本 ```bash $ git clone https://github.com/dataabc/weibospider.git ``` -运行上述命令,将本项目下载到当前目录,如果下载成功当前目录会出现一个名为"weibospider"的文件夹;
-2.用文本编辑器打开weibospider文件夹下的"weibospider.py"文件;
-3.将"weibospider.py"文件中的“your cookie”替换成爬虫微博的cookie,后面会详细讲解如何获取cookie;
-4.将"weibospider.py"文件中的user_id替换成想要爬取的微博的user_id,后面会详细讲解如何获取user_id;
-5.按需求调用脚本。本脚本是一个Weibo类,用户可以按照自己的需求调用Weibo类。 +运行上述命令,将本项目下载到当前目录,如果下载成功当前目录会出现一个名为"weibospider"的文件夹; +## 2.设置cookie和user_id +打开weibospider文件夹下的"**weibospider.py**"文件,将“**your cookie**”替换成爬虫微博的cookie,后面会详细讲解如何获取cookie;将**user_id**替换成想要爬取的微博的user_id,后面会详细讲解如何获取user_id; +## 3.运行脚本 +大家可以根据自己的运行环境选择运行方式,Linux可以通过 +```bash +$ python weibospider.py +``` +运行; +## 4.按需求修改脚本(可选) +本脚本是一个Weibo类,用户可以按照自己的需求调用Weibo类。 例如用户可以直接在"weibospider.py"文件中调用Weibo类,具体调用代码示例如下: ```python user_id = 1669879400 @@ -38,32 +46,26 @@ filter = 1 wb = Weibo(user_id,filter) #调用Weibo类,创建微博实例wb wb.start() #爬取微博信息 ``` -user_id可以改成任意合法的用户id(爬虫的微博id除外);filter默认值为0,表示爬取所有微博信息(转发微博+原创微博),为1表示只爬取用户的所有原创微博;wb是Weibo类的一个实例,也可以是其它名字,只要符合python的命名规范即可;通过执行wb.start() 完成了微博的爬取工作。在上述代码之后,我们可以得到很多信息:
+user_id可以改成任意合法的用户id(爬虫的微博id除外);filter默认值为0,表示爬取所有微博信息(转发微博+原创微博),为1表示只爬取用户的所有原创微博;wb是Weibo类的一个实例,也可以是其它名字,只要符合python的命名规范即可;通过执行wb.start() 完成了微博的爬取工作。在上述代码执行后,我们可以得到很多信息:
**wb.username**:用户名;
**wb.weibo_num**:微博数;
**wb.following**:关注数;
**wb.followers**:粉丝数;
-**wb.weibo_content**:存储用户的所有微博,为list形式,若filter=1, wb.weibo_content[0]为最新一条**原创**微博,filter=0为最新一条微博,wb.weibo_content[1]、wb.weibo_content[2]分别表示第二新和第三新的微博,以此类推。当然如果用户没有发过微博,wb.weibo_content则为[];
+**wb.weibo_content**:存储用户的所有微博,为list形式,若filter=1, wb.weibo_content[0]为最新一条**原创**微博,filter=0为最新一条微博,wb.weibo_content[1]、wb.weibo_content[2]分别表示第二新和第三新的微博,以此类推。当然如果用户没有发过微博,则wb.weibo_content为[];
+**wb.weibo_place**: 存储微博的发布位置,为list形式,如wb.weibo_place[0]为最新一条微博的发布位置,与wb.weibo_content[0]对应,如果该条微博没有位置信息,则weibo_place内容为无,其它用法同wb.weibo_content;
**wb.publish_time**: 存储微博的发布时间,为list形式,如wb.publish_time[0]为最新一条微博的发布时间,与wb.weibo_content[0]对应,其它用法同wb.weibo_content;
**wb.up_num**:存储微博获得的点赞数,为list形式,如wb.up_num[0]为最新一条微博获得的点赞数,与wb.weibo_content[0]对应,其它用法同wb.weibo_content;
**wb.retweet_num**:存储微博获得的转发数,为list形式,如wb.retweet_num[0]为最新一条微博获得的转发数,与wb.weibo_content[0]对应,其它用法同wb.weibo_content;
**wb.comment_num**:存储微博获得的评论数,为list形式,如wb.comment_num[0]为最新一条微博获得的评论数,与wb.weibo_content[0]对应,其它用法同wb.weibo_content。
-6.运行脚本。我的运行环境是IPython,通过 -```bash -$ run filepath/weibospider.py -``` -即可运行脚本,大家可以根据自己的运行环境选择运行方式; -Linux可以通过 -```bash -$ python filepath/weibospider.py -``` +**wb.publish_tool**:存储微博的发布工具,为list形式,如wb.publish_tool[0]为最新一条微博的发布工具,与wb.weibo_content[0]对应,其它用法同wb.weibo_content。 + # 如何获取cookie 1.用Chrome打开
2.输入微博的用户名、密码,登录,如图所示: ![](https://picture.cognize.me/cognize/github/weibospider/cookie1.png) -登录成功后会跳转到;
-3.按F12键打开Chrome开发者工具,在地址栏输入并跳转到,跳转后会显示如下类似界面:
+登录成功后会跳转到;
+3.按F12键打开Chrome开发者工具,在地址栏输入并跳转到,跳转后会显示如下类似界面: ![](https://picture.cognize.me/cognize/github/weibospider/cookie2.png) 4.点击Chrome开发者工具“Name"列表中的"weibo.cn",点击"Headers",其中"Request Headers"下,"Cookie"后的值即为我们要找的cookie值,复制即可,如图所示: ![](https://picture.cognize.me/cognize/github/weibospider/cookie3.png) From 607f877d2588abd6e3a636c14033f7e7602cecec Mon Sep 17 00:00:00 2001 From: chenlei Date: Tue, 25 Dec 2018 20:42:07 +0800 Subject: [PATCH 006/363] =?UTF-8?q?refactor:=20=E9=87=8D=E6=9E=84get=5Fwei?= =?UTF-8?q?bo=5Finfo?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- weiboSpider.py | 197 ++++++++++++++++++++++++++++--------------------- 1 file changed, 114 insertions(+), 83 deletions(-) diff --git a/weiboSpider.py b/weiboSpider.py index 74abdba9..383256a7 100644 --- a/weiboSpider.py +++ b/weiboSpider.py @@ -20,10 +20,10 @@ def __init__(self, user_id, filter=0): self.filter = filter # 取值范围为0、1,程序默认值为0,代表要爬取用户的全部微博,1代表只爬取用户的原创微博 self.username = '' # 用户名,如“Dear-迪丽热巴” self.weibo_num = 0 # 用户全部微博数 - self.weibo_num2 = 0 # 爬取到的微博数 + self.weibo_num2 = 0 # 爬取到的微博数 self.following = 0 # 用户关注数 self.followers = 0 # 用户粉丝数 - self.weibo_content = [] # 微博内容 + self.weibo_content = [] # 微博内容 self.weibo_place = [] # 微博位置 self.publish_time = [] # 微博发布时间 self.up_num = [] # 微博对应的点赞数 @@ -115,7 +115,110 @@ def get_retweet(self, is_retweet, info, wb_content): print("Error: ", e) traceback.print_exc() - # 获取用户微博内容及对应的发布时间、点赞数、转发数、评论数 + # 获取微博内容 + def get_weibo_content(self, info): + try: + str_t = info.xpath("div/span[@class='ctt']") + weibo_content = str_t[0].xpath("string(.)").replace(u"\u200b", "").encode( + sys.stdout.encoding, "ignore").decode( + sys.stdout.encoding) + weibo_content = weibo_content[:-1] + weibo_id = info.xpath("@id")[0][2:] + a_link = info.xpath("div/span[@class='ctt']/a") + is_retweet = info.xpath("div/span[@class='cmt']") + if a_link: + if a_link[-1].xpath("text()")[0] == u"全文": + weibo_link = "https://weibo.cn/comment/" + weibo_id 
+ wb_content = self.get_long_weibo(weibo_link) + if wb_content: + if not is_retweet: + wb_content = wb_content[1:] + weibo_content = wb_content + if is_retweet: + weibo_content = self.get_retweet( + is_retweet, info, weibo_content) + self.weibo_content.append(weibo_content) + print(weibo_content) + except Exception as e: + print("Error: ", e) + traceback.print_exc() + + # 获取微博发布位置 + def get_weibo_place(self, info): + try: + div_first = info.xpath("div")[0] + a_list = div_first.xpath("a") + weibo_place = u"无" + for a in a_list: + if ("place.weibo.com" in a.xpath("@href")[0] and + a.xpath("text()")[0] == u"显示地图"): + weibo_a = div_first.xpath("span[@class='ctt']/a") + if len(weibo_a) >= 1: + weibo_place = weibo_a[-1] + if u"的秒拍视频" in div_first.xpath("span[@class='ctt']/a/text()")[-1]: + if len(weibo_a) >= 2: + weibo_place = weibo_a[-2] + else: + weibo_place = u"无" + weibo_place = weibo_place.xpath("string(.)").encode( + sys.stdout.encoding, "ignore").decode(sys.stdout.encoding) + break + self.weibo_place.append(weibo_place) + print(u"微博位置: " + weibo_place) + except Exception as e: + print("Error: ", e) + traceback.print_exc() + + # 获取微博发布时间 + def get_publish_time(self, info): + try: + str_time = info.xpath("div/span[@class='ct']") + str_time = str_time[0].xpath("string(.)").encode( + sys.stdout.encoding, "ignore").decode(sys.stdout.encoding) + publish_time = str_time.split(u'来自')[0] + if u"刚刚" in publish_time: + publish_time = datetime.now().strftime( + '%Y-%m-%d %H:%M') + elif u"分钟" in publish_time: + minute = publish_time[:publish_time.find(u"分钟")] + minute = timedelta(minutes=int(minute)) + publish_time = (datetime.now() - minute).strftime( + "%Y-%m-%d %H:%M") + elif u"今天" in publish_time: + today = datetime.now().strftime("%Y-%m-%d") + time = publish_time[3:] + publish_time = today + " " + time + elif u"月" in publish_time: + year = datetime.now().strftime("%Y") + month = publish_time[0:2] + day = publish_time[3:5] + time = publish_time[7:12] + publish_time = (year 
+ "-" + month + "-" + day + " " + time) + else: + publish_time = publish_time[:16] + self.publish_time.append(publish_time) + print(u"微博发布时间: " + publish_time) + except Exception as e: + print("Error: ", e) + traceback.print_exc() + + # 获取微博发布工具 + def get_publish_tool(self, info): + try: + str_time = info.xpath("div/span[@class='ct']") + str_time = str_time[0].xpath("string(.)").encode( + sys.stdout.encoding, "ignore").decode(sys.stdout.encoding) + if len(str_time.split(u'来自')) > 1: + publish_tool = str_time.split(u'来自')[1] + else: + publish_tool = u"无" + self.publish_tool.append(publish_tool) + print(u"微博发布工具: " + publish_tool) + except Exception as e: + print("Error: ", e) + traceback.print_exc() + + # 获取用户微博信息 def get_weibo_info(self): try: url = "https://weibo.cn/u/%d?filter=%d&page=1" % ( @@ -137,90 +240,18 @@ def get_weibo_info(self): is_empty = info[0].xpath("div/span[@class='ctt']") if is_empty: for i in range(0, len(info) - 2): + # 微博内容 - str_t = info[i].xpath("div/span[@class='ctt']") - weibo_content = str_t[0].xpath("string(.)").replace(u"\u200b", "").encode( - sys.stdout.encoding, "ignore").decode( - sys.stdout.encoding) - weibo_content = weibo_content[:-1] - weibo_id = info[i].xpath("@id")[0][2:] - a_link = info[i].xpath( - "div/span[@class='ctt']/a") - is_retweet = info[i].xpath("div/span[@class='cmt']") - if a_link: - if a_link[-1].xpath("text()")[0] == u"全文": - weibo_link = "https://weibo.cn/comment/" + weibo_id - wb_content = self.get_long_weibo(weibo_link) - if wb_content: - if not is_retweet: - wb_content = wb_content[1:] - weibo_content = wb_content - if is_retweet: - weibo_content = self.get_retweet( - is_retweet, info[i], weibo_content) - self.weibo_content.append(weibo_content) - print(weibo_content) + self.get_weibo_content(info[i]) # 微博位置 - div_first = info[i].xpath("div")[0] - a_list = div_first.xpath("a") - weibo_place = u"无" - for a in a_list: - if ("place.weibo.com" in a.xpath("@href")[0] and - a.xpath("text()")[0] == u"显示地图"): - 
weibo_a = div_first.xpath( - "span[@class='ctt']/a") - if len(weibo_a) >= 1: - weibo_place = weibo_a[-1] - if u"的秒拍视频" in div_first.xpath("span[@class='ctt']/a/text()")[-1]: - if len(weibo_a) >= 2: - weibo_place = weibo_a[-2] - else: - weibo_place = u"无" - weibo_place = weibo_place.xpath("string(.)").encode( - sys.stdout.encoding, "ignore").decode(sys.stdout.encoding) - break - self.weibo_place.append(weibo_place) - print(u"微博位置: " + weibo_place) + self.get_weibo_place(info[i]) # 微博发布时间 - str_time = info[i].xpath("div/span[@class='ct']") - str_time = str_time[0].xpath("string(.)").encode( - sys.stdout.encoding, "ignore").decode( - sys.stdout.encoding) - publish_time = str_time.split(u'来自')[0] - if u"刚刚" in publish_time: - publish_time = datetime.now().strftime( - '%Y-%m-%d %H:%M') - elif u"分钟" in publish_time: - minute = publish_time[:publish_time.find(u"分钟")] - minute = timedelta(minutes=int(minute)) - publish_time = ( - datetime.now() - minute).strftime( - "%Y-%m-%d %H:%M") - elif u"今天" in publish_time: - today = datetime.now().strftime("%Y-%m-%d") - time = publish_time[3:] - publish_time = today + " " + time - elif u"月" in publish_time: - year = datetime.now().strftime("%Y") - month = publish_time[0:2] - day = publish_time[3:5] - time = publish_time[7:12] - publish_time = ( - year + "-" + month + "-" + day + " " + time) - else: - publish_time = publish_time[:16] - self.publish_time.append(publish_time) - print(u"微博发布时间: " + publish_time) + self.get_publish_time(info[i]) # 微博发布工具 - if len(str_time.split(u'来自')) > 1: - publish_tool = str_time.split(u'来自')[1] - else: - publish_tool = u"无" - self.publish_tool.append(publish_tool) - print(u"微博发布工具: " + publish_tool) + self.get_publish_tool(info[i]) str_footer = info[i].xpath("div")[-1] str_footer = str_footer.xpath("string(.)").encode( @@ -276,8 +307,8 @@ def write_txt(self): u"微博位置: " + self.weibo_place[i - 1] + "\n" + u"发布时间: " + self.publish_time[i - 1] + "\n" + u"点赞数: " + str(self.up_num[i - 1]) + - u" 转发数: " + 
str(self.retweet_num[i - 1]) + - u" 评论数: " + str(self.comment_num[i - 1]) + "\n" + + u" 转发数: " + str(self.retweet_num[i - 1]) + + u" 评论数: " + str(self.comment_num[i - 1]) + "\n" + u"发布工具: " + self.publish_tool[i - 1] + "\n\n" ) result = result + text @@ -314,7 +345,7 @@ def main(): # 使用实例,输入一个用户id,所有信息都会存储在wb实例中 user_id = 1476938315 # 可以改成任意合法的用户id(爬虫的微博id除外) filter = 1 # 值为0表示爬取全部微博(原创微博+转发微博),值为1表示只爬取原创微博 - wb = Weibo(user_id, filter) # 调用Weibo类,创建微博实例wb + wb = Weibo(user_id, filter) # 调用Weibo类,创建微博实例wb wb.start() # 爬取微博信息 print(u"用户名: " + wb.username) print(u"全部微博数: " + str(wb.weibo_num)) From 5c264c4b115b6e28ad926d17010a3ce56976da54 Mon Sep 17 00:00:00 2001 From: chenlei Date: Thu, 14 Feb 2019 19:59:29 +0800 Subject: [PATCH 007/363] =?UTF-8?q?fix:=20=E8=A7=A3=E5=86=B3=E6=97=A0?= =?UTF-8?q?=E6=B3=95=E8=8E=B7=E5=8F=96=E5=BE=AE=E5=8D=9A=E5=86=85=E5=AE=B9?= =?UTF-8?q?=E6=9C=80=E5=90=8E=E4=B8=80=E4=B8=AA=E5=AD=97=E7=AC=A6=E7=9A=84?= =?UTF-8?q?=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- weiboSpider.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/weiboSpider.py b/weiboSpider.py index 383256a7..a625e9b9 100644 --- a/weiboSpider.py +++ b/weiboSpider.py @@ -20,10 +20,10 @@ def __init__(self, user_id, filter=0): self.filter = filter # 取值范围为0、1,程序默认值为0,代表要爬取用户的全部微博,1代表只爬取用户的原创微博 self.username = '' # 用户名,如“Dear-迪丽热巴” self.weibo_num = 0 # 用户全部微博数 - self.weibo_num2 = 0 # 爬取到的微博数 + self.weibo_num2 = 0 # 爬取到的微博数 self.following = 0 # 用户关注数 self.followers = 0 # 用户粉丝数 - self.weibo_content = [] # 微博内容 + self.weibo_content = [] # 微博内容 self.weibo_place = [] # 微博位置 self.publish_time = [] # 微博发布时间 self.up_num = [] # 微博对应的点赞数 @@ -122,7 +122,6 @@ def get_weibo_content(self, info): weibo_content = str_t[0].xpath("string(.)").replace(u"\u200b", "").encode( sys.stdout.encoding, "ignore").decode( sys.stdout.encoding) - weibo_content = weibo_content[:-1] weibo_id = info.xpath("@id")[0][2:] 
a_link = info.xpath("div/span[@class='ctt']/a") is_retweet = info.xpath("div/span[@class='cmt']") @@ -307,8 +306,8 @@ def write_txt(self): u"微博位置: " + self.weibo_place[i - 1] + "\n" + u"发布时间: " + self.publish_time[i - 1] + "\n" + u"点赞数: " + str(self.up_num[i - 1]) + - u" 转发数: " + str(self.retweet_num[i - 1]) + - u" 评论数: " + str(self.comment_num[i - 1]) + "\n" + + u" 转发数: " + str(self.retweet_num[i - 1]) + + u" 评论数: " + str(self.comment_num[i - 1]) + "\n" + u"发布工具: " + self.publish_tool[i - 1] + "\n\n" ) result = result + text @@ -345,7 +344,7 @@ def main(): # 使用实例,输入一个用户id,所有信息都会存储在wb实例中 user_id = 1476938315 # 可以改成任意合法的用户id(爬虫的微博id除外) filter = 1 # 值为0表示爬取全部微博(原创微博+转发微博),值为1表示只爬取原创微博 - wb = Weibo(user_id, filter) # 调用Weibo类,创建微博实例wb + wb = Weibo(user_id, filter) # 调用Weibo类,创建微博实例wb wb.start() # 爬取微博信息 print(u"用户名: " + wb.username) print(u"全部微博数: " + str(wb.weibo_num)) From f5cfdd4227fae0c616f2a248b11ef65ad7b9d0e7 Mon Sep 17 00:00:00 2001 From: chenlei Date: Thu, 14 Mar 2019 20:24:22 +0800 Subject: [PATCH 008/363] =?UTF-8?q?feat:=20=E6=B7=BB=E5=8A=A0=E8=BF=9B?= =?UTF-8?q?=E5=BA=A6=E6=9D=A1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Issue #45 --- weiboSpider.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/weiboSpider.py b/weiboSpider.py index a625e9b9..37b307a8 100644 --- a/weiboSpider.py +++ b/weiboSpider.py @@ -9,6 +9,7 @@ from datetime import datetime from datetime import timedelta from lxml import etree +from tqdm import tqdm class Weibo: @@ -20,10 +21,10 @@ def __init__(self, user_id, filter=0): self.filter = filter # 取值范围为0、1,程序默认值为0,代表要爬取用户的全部微博,1代表只爬取用户的原创微博 self.username = '' # 用户名,如“Dear-迪丽热巴” self.weibo_num = 0 # 用户全部微博数 - self.weibo_num2 = 0 # 爬取到的微博数 + self.weibo_num2 = 0 # 爬取到的微博数 self.following = 0 # 用户关注数 self.followers = 0 # 用户粉丝数 - self.weibo_content = [] # 微博内容 + self.weibo_content = [] # 微博内容 self.weibo_place = [] # 微博位置 self.publish_time = [] # 微博发布时间 self.up_num 
= [] # 微博对应的点赞数 @@ -230,7 +231,7 @@ def get_weibo_info(self): page_num = (int)(selector.xpath( "//input[@name='mp']")[0].attrib["value"]) pattern = r"\d+\.?\d*" - for page in range(1, page_num + 1): + for page in tqdm(range(1, page_num + 1), desc=u"进度"): url2 = "https://weibo.cn/u/%d?filter=%d&page=%d" % ( self.user_id, self.filter, page) html2 = requests.get(url2, cookies=self.cookie).content @@ -306,8 +307,8 @@ def write_txt(self): u"微博位置: " + self.weibo_place[i - 1] + "\n" + u"发布时间: " + self.publish_time[i - 1] + "\n" + u"点赞数: " + str(self.up_num[i - 1]) + - u" 转发数: " + str(self.retweet_num[i - 1]) + - u" 评论数: " + str(self.comment_num[i - 1]) + "\n" + + u" 转发数: " + str(self.retweet_num[i - 1]) + + u" 评论数: " + str(self.comment_num[i - 1]) + "\n" + u"发布工具: " + self.publish_tool[i - 1] + "\n\n" ) result = result + text @@ -342,9 +343,9 @@ def start(self): def main(): try: # 使用实例,输入一个用户id,所有信息都会存储在wb实例中 - user_id = 1476938315 # 可以改成任意合法的用户id(爬虫的微博id除外) + user_id = 5019711860 # 可以改成任意合法的用户id(爬虫的微博id除外) filter = 1 # 值为0表示爬取全部微博(原创微博+转发微博),值为1表示只爬取原创微博 - wb = Weibo(user_id, filter) # 调用Weibo类,创建微博实例wb + wb = Weibo(user_id, filter) # 调用Weibo类,创建微博实例wb wb.start() # 爬取微博信息 print(u"用户名: " + wb.username) print(u"全部微博数: " + str(wb.weibo_num)) From 56a66ed3f67863f5ddf95eca39ba42930d13484a Mon Sep 17 00:00:00 2001 From: chenlei Date: Mon, 8 Apr 2019 01:25:56 +0800 Subject: [PATCH 009/363] =?UTF-8?q?fix:=20=E4=BF=AE=E5=A4=8D=E6=97=A0?= =?UTF-8?q?=E6=B3=95=E8=8E=B7=E5=8F=96=E5=85=A8=E9=83=A8=E5=8E=9F=E5=88=9B?= =?UTF-8?q?=E5=BE=AE=E5=8D=9A=E7=9A=84bug?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 以前版本通过提取获取用户原创微博。此方法的问题是,微博可能对原创微博显示做了限制,只能提取部分原创微博。 现在的做法是遍历所有微博,因为原创微博和转发微博格式不同,通过判断是否有原创微博格式来确定是否为原创微博,进而提取 Issue #49 --- weiboSpider.py | 71 +++++++++++++++++++++++++------------------------- 1 file changed, 36 insertions(+), 35 deletions(-) diff --git a/weiboSpider.py b/weiboSpider.py index 37b307a8..6d0a20e2 100644 --- a/weiboSpider.py 
+++ b/weiboSpider.py @@ -48,8 +48,8 @@ def get_username(self): # 获取用户微博数、关注数、粉丝数 def get_user_info(self): try: - url = "https://weibo.cn/u/%d?filter=%d&page=1" % ( - self.user_id, self.filter) + url = "https://weibo.cn/u/%d?page=1" % ( + self.user_id) html = requests.get(url, cookies=self.cookie).content selector = etree.HTML(html) pattern = r"\d+\.?\d*" @@ -221,8 +221,8 @@ def get_publish_tool(self, info): # 获取用户微博信息 def get_weibo_info(self): try: - url = "https://weibo.cn/u/%d?filter=%d&page=1" % ( - self.user_id, self.filter) + url = "https://weibo.cn/u/%d?page=1" % ( + self.user_id) html = requests.get(url, cookies=self.cookie).content selector = etree.HTML(html) if selector.xpath("//input[@name='mp']") == []: @@ -232,51 +232,52 @@ def get_weibo_info(self): "//input[@name='mp']")[0].attrib["value"]) pattern = r"\d+\.?\d*" for page in tqdm(range(1, page_num + 1), desc=u"进度"): - url2 = "https://weibo.cn/u/%d?filter=%d&page=%d" % ( - self.user_id, self.filter, page) + url2 = "https://weibo.cn/u/%d?page=%d" % (self.user_id, page) html2 = requests.get(url2, cookies=self.cookie).content selector2 = etree.HTML(html2) info = selector2.xpath("//div[@class='c']") is_empty = info[0].xpath("div/span[@class='ctt']") if is_empty: for i in range(0, len(info) - 2): + is_retweet = info[i].xpath("div/span[@class='cmt']") + if (not self.filter) or (not is_retweet): - # 微博内容 - self.get_weibo_content(info[i]) + # 微博内容 + self.get_weibo_content(info[i]) - # 微博位置 - self.get_weibo_place(info[i]) + # 微博位置 + self.get_weibo_place(info[i]) - # 微博发布时间 - self.get_publish_time(info[i]) + # 微博发布时间 + self.get_publish_time(info[i]) - # 微博发布工具 - self.get_publish_tool(info[i]) + # 微博发布工具 + self.get_publish_tool(info[i]) - str_footer = info[i].xpath("div")[-1] - str_footer = str_footer.xpath("string(.)").encode( - sys.stdout.encoding, "ignore").decode(sys.stdout.encoding) - str_footer = str_footer[str_footer.rfind(u'赞'):] - guid = re.findall(pattern, str_footer, re.M) + str_footer = 
info[i].xpath("div")[-1] + str_footer = str_footer.xpath("string(.)").encode( + sys.stdout.encoding, "ignore").decode(sys.stdout.encoding) + str_footer = str_footer[str_footer.rfind(u'赞'):] + guid = re.findall(pattern, str_footer, re.M) - # 点赞数 - up_num = int(guid[0]) - self.up_num.append(up_num) - print(u"点赞数: " + str(up_num)) + # 点赞数 + up_num = int(guid[0]) + self.up_num.append(up_num) + print(u"点赞数: " + str(up_num)) - # 转发数 - retweet_num = int(guid[1]) - self.retweet_num.append(retweet_num) - print(u"转发数: " + str(retweet_num)) + # 转发数 + retweet_num = int(guid[1]) + self.retweet_num.append(retweet_num) + print(u"转发数: " + str(retweet_num)) - # 评论数 - comment_num = int(guid[2]) - self.comment_num.append(comment_num) - print(u"评论数: " + str(comment_num)) - print( - "===========================================================================") + # 评论数 + comment_num = int(guid[2]) + self.comment_num.append(comment_num) + print(u"评论数: " + str(comment_num)) - self.weibo_num2 += 1 + self.weibo_num2 += 1 + print( + "===========================================================================") if not self.filter: print(u"共" + str(self.weibo_num2) + u"条微博") @@ -343,7 +344,7 @@ def start(self): def main(): try: # 使用实例,输入一个用户id,所有信息都会存储在wb实例中 - user_id = 5019711860 # 可以改成任意合法的用户id(爬虫的微博id除外) + user_id = 1655747731 # 可以改成任意合法的用户id(爬虫的微博id除外) filter = 1 # 值为0表示爬取全部微博(原创微博+转发微博),值为1表示只爬取原创微博 wb = Weibo(user_id, filter) # 调用Weibo类,创建微博实例wb wb.start() # 爬取微博信息 From cce2e786be91152565a9f8c6c499a07c7c99eadd Mon Sep 17 00:00:00 2001 From: chenlei Date: Mon, 15 Apr 2019 20:45:45 +0800 Subject: [PATCH 010/363] =?UTF-8?q?fix:=20=E4=BF=AE=E6=94=B9=E6=97=A0?= =?UTF-8?q?=E6=B3=95=E8=8E=B7=E5=8F=96=E9=83=A8=E5=88=86=E5=BE=AE=E5=8D=9A?= =?UTF-8?q?=E5=85=A8=E6=96=87=E7=9A=84=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Issue #52 --- weiboSpider.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) 
diff --git a/weiboSpider.py b/weiboSpider.py index 6d0a20e2..e384f93f 100644 --- a/weiboSpider.py +++ b/weiboSpider.py @@ -88,9 +88,12 @@ def get_long_weibo(self, weibo_link): html = requests.get(weibo_link, cookies=self.cookie).content selector = etree.HTML(html) info = selector.xpath("//div[@class='c']")[1] - wb_content = info.xpath("div/span[@class='ctt']")[0].xpath( + wb_content = info.xpath( "string(.)").replace(u"\u200b", "").encode(sys.stdout.encoding, "ignore").decode( sys.stdout.encoding) + wb_time = info.xpath("//span[@class='ct']/text()")[0] + wb_content = wb_content[wb_content.find( + ":") + 1:wb_content.rfind(wb_time)] return wb_content except Exception as e: print("Error: ", e) @@ -119,21 +122,18 @@ def get_retweet(self, is_retweet, info, wb_content): # 获取微博内容 def get_weibo_content(self, info): try: - str_t = info.xpath("div/span[@class='ctt']") - weibo_content = str_t[0].xpath("string(.)").replace(u"\u200b", "").encode( + weibo_content = info.xpath("string(.)").replace(u"\u200b", "").encode( sys.stdout.encoding, "ignore").decode( sys.stdout.encoding) + weibo_content = weibo_content[:weibo_content.rfind(u"赞")] weibo_id = info.xpath("@id")[0][2:] - a_link = info.xpath("div/span[@class='ctt']/a") + a_text = info.xpath("//a/text()") is_retweet = info.xpath("div/span[@class='cmt']") - if a_link: - if a_link[-1].xpath("text()")[0] == u"全文": - weibo_link = "https://weibo.cn/comment/" + weibo_id - wb_content = self.get_long_weibo(weibo_link) - if wb_content: - if not is_retweet: - wb_content = wb_content[1:] - weibo_content = wb_content + if u"全文" in a_text: + weibo_link = "https://weibo.cn/comment/" + weibo_id + wb_content = self.get_long_weibo(weibo_link) + if wb_content: + weibo_content = wb_content if is_retweet: weibo_content = self.get_retweet( is_retweet, info, weibo_content) @@ -344,7 +344,7 @@ def start(self): def main(): try: # 使用实例,输入一个用户id,所有信息都会存储在wb实例中 - user_id = 1655747731 # 可以改成任意合法的用户id(爬虫的微博id除外) + user_id = 3937348351 # 
可以改成任意合法的用户id(爬虫的微博id除外) filter = 1 # 值为0表示爬取全部微博(原创微博+转发微博),值为1表示只爬取原创微博 wb = Weibo(user_id, filter) # 调用Weibo类,创建微博实例wb wb.start() # 爬取微博信息 From 90bd6c5d8bdc88450d553767008c6a462f8fd06a Mon Sep 17 00:00:00 2001 From: chenlei Date: Tue, 16 Apr 2019 18:28:43 +0800 Subject: [PATCH 011/363] =?UTF-8?q?fix:=20=E4=BF=AE=E5=A4=8D=E6=97=A0?= =?UTF-8?q?=E6=B3=95=E8=AF=86=E5=88=AB=E9=83=A8=E5=88=86=E9=95=BF=E5=BE=AE?= =?UTF-8?q?=E5=8D=9A=E7=9A=84bug?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- weiboSpider.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/weiboSpider.py b/weiboSpider.py index e384f93f..32025643 100644 --- a/weiboSpider.py +++ b/weiboSpider.py @@ -127,7 +127,7 @@ def get_weibo_content(self, info): sys.stdout.encoding) weibo_content = weibo_content[:weibo_content.rfind(u"赞")] weibo_id = info.xpath("@id")[0][2:] - a_text = info.xpath("//a/text()") + a_text = info.xpath("div//a/text()") is_retweet = info.xpath("div/span[@class='cmt']") if u"全文" in a_text: weibo_link = "https://weibo.cn/comment/" + weibo_id From 6293bf44a6ef0968d8a38173e0342bab92ce331c Mon Sep 17 00:00:00 2001 From: chenlei Date: Wed, 17 Apr 2019 19:47:25 +0800 Subject: [PATCH 012/363] =?UTF-8?q?fix:=20=E4=BF=AE=E5=A4=8D=E6=97=A0?= =?UTF-8?q?=E6=B3=95=E8=8E=B7=E5=8F=96"=E9=95=BF=E8=BD=AC=E5=8F=91?= =?UTF-8?q?=E5=BE=AE=E5=8D=9A"=E5=85=A8=E9=83=A8=E6=96=87=E5=AD=97?= =?UTF-8?q?=E5=86=85=E5=AE=B9=E7=9A=84=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit "长转发微博"是指原始微博或转发理由很长,在微博列表页无法全部显示的微博,需要在该微博评论页面抓取全部信息 --- weiboSpider.py | 71 ++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 51 insertions(+), 20 deletions(-) diff --git a/weiboSpider.py b/weiboSpider.py index 32025643..e057cabf 100644 --- a/weiboSpider.py +++ b/weiboSpider.py @@ -77,12 +77,11 @@ def get_user_info(self): print(u"粉丝数: " + str(self.followers)) print( 
"===========================================================================") - except Exception as e: print("Error: ", e) traceback.print_exc() - # 获取"长微博"全部文字内容 + # 获取"长原创微博" def get_long_weibo(self, weibo_link): try: html = requests.get(weibo_link, cookies=self.cookie).content @@ -99,15 +98,57 @@ def get_long_weibo(self, weibo_link): print("Error: ", e) traceback.print_exc() - # 获取转发微博信息 - def get_retweet(self, is_retweet, info, wb_content): + # 获取原创微博 + def get_original_weibo(self, info): try: - original_user = is_retweet[0].xpath("a/text()") + weibo_content = info.xpath("string(.)").replace(u"\u200b", "").encode( + sys.stdout.encoding, "ignore").decode( + sys.stdout.encoding) + weibo_content = weibo_content[:weibo_content.rfind(u"赞")] + a_text = info.xpath("div//a/text()") + if u"全文" in a_text: + weibo_id = info.xpath("@id")[0][2:] + weibo_link = "https://weibo.cn/comment/" + weibo_id + wb_content = self.get_long_weibo(weibo_link) + if wb_content: + weibo_content = wb_content + return weibo_content + except Exception as e: + print("Error: ", e) + traceback.print_exc() + + # 获取"长转发微博" + def get_long_retweet(self, weibo_link): + try: + wb_content = self.get_long_weibo(weibo_link) + wb_content = wb_content[:wb_content.rfind(u"原文转发")] + return wb_content + except Exception as e: + print("Error: ", e) + traceback.print_exc() + + # 获取转发微博 + def get_retweet(self, info): + try: + original_user = info.xpath("div/span[@class='cmt']/a/text()") if not original_user: wb_content = u"转发微博已被删除" return wb_content else: original_user = original_user[0] + wb_content = info.xpath("string(.)").replace(u"\u200b", "").encode( + sys.stdout.encoding, "ignore").decode( + sys.stdout.encoding) + wb_content = wb_content[wb_content.find( + ":") + 1:wb_content.rfind(u"赞")] + wb_content = wb_content[:wb_content.rfind(u"赞")] + a_text = info.xpath("div//a/text()") + if u"全文" in a_text: + weibo_id = info.xpath("@id")[0][2:] + weibo_link = "https://weibo.cn/comment/" + weibo_id + wb_content = 
self.get_long_retweet(weibo_link) + if wb_content: + weibo_content = wb_content retweet_reason = info.xpath("div")[-1].xpath("string(.)").replace(u"\u200b", "").encode( sys.stdout.encoding, "ignore").decode( sys.stdout.encoding) @@ -122,21 +163,11 @@ def get_retweet(self, is_retweet, info, wb_content): # 获取微博内容 def get_weibo_content(self, info): try: - weibo_content = info.xpath("string(.)").replace(u"\u200b", "").encode( - sys.stdout.encoding, "ignore").decode( - sys.stdout.encoding) - weibo_content = weibo_content[:weibo_content.rfind(u"赞")] - weibo_id = info.xpath("@id")[0][2:] - a_text = info.xpath("div//a/text()") is_retweet = info.xpath("div/span[@class='cmt']") - if u"全文" in a_text: - weibo_link = "https://weibo.cn/comment/" + weibo_id - wb_content = self.get_long_weibo(weibo_link) - if wb_content: - weibo_content = wb_content if is_retweet: - weibo_content = self.get_retweet( - is_retweet, info, weibo_content) + weibo_content = self.get_retweet(info) + else: + weibo_content = self.get_original_weibo(info) self.weibo_content.append(weibo_content) print(weibo_content) except Exception as e: @@ -344,8 +375,8 @@ def start(self): def main(): try: # 使用实例,输入一个用户id,所有信息都会存储在wb实例中 - user_id = 3937348351 # 可以改成任意合法的用户id(爬虫的微博id除外) - filter = 1 # 值为0表示爬取全部微博(原创微博+转发微博),值为1表示只爬取原创微博 + user_id = 2992050891 # 可以改成任意合法的用户id(爬虫的微博id除外) + filter = 0 # 值为0表示爬取全部微博(原创微博+转发微博),值为1表示只爬取原创微博 wb = Weibo(user_id, filter) # 调用Weibo类,创建微博实例wb wb.start() # 爬取微博信息 print(u"用户名: " + wb.username) From fd005fa29f6beb7517f1383429c683a44ac8cdd8 Mon Sep 17 00:00:00 2001 From: chenlei Date: Sat, 18 May 2019 18:28:49 +0800 Subject: [PATCH 013/363] =?UTF-8?q?refactor:=20=E4=BB=A3=E7=A0=81=E9=87=8D?= =?UTF-8?q?=E6=9E=84?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- weiboSpider.py | 74 ++++++++++++++++++++++++++------------------------ 1 file changed, 38 insertions(+), 36 deletions(-) diff --git a/weiboSpider.py b/weiboSpider.py index 
e057cabf..1fb1f306 100644 --- a/weiboSpider.py +++ b/weiboSpider.py @@ -32,12 +32,32 @@ def __init__(self, user_id, filter=0): self.comment_num = [] # 微博对应的评论数 self.publish_tool = [] # 微博发布工具 + # 处理html + def deal_html(self, url): + try: + html = requests.get(url, cookies=self.cookie).content + selector = etree.HTML(html) + return selector + except Exception as e: + print("Error: ", e) + traceback.print_exc() + + # 处理乱码 + def deal_garbled(self, info): + try: + info = info.xpath( + "string(.)").replace(u"\u200b", "").encode(sys.stdout.encoding, "ignore").decode( + sys.stdout.encoding) + return info + except Exception as e: + print("Error: ", e) + traceback.print_exc() + # 获取用户昵称 def get_username(self): try: url = "https://weibo.cn/%d/info" % (self.user_id) - html = requests.get(url, cookies=self.cookie).content - selector = etree.HTML(html) + selector = self.deal_html(url) username = selector.xpath("//title/text()")[0] self.username = username[:-3] print(u"用户名: " + self.username) @@ -48,10 +68,8 @@ def get_username(self): # 获取用户微博数、关注数、粉丝数 def get_user_info(self): try: - url = "https://weibo.cn/u/%d?page=1" % ( - self.user_id) - html = requests.get(url, cookies=self.cookie).content - selector = etree.HTML(html) + url = "https://weibo.cn/u/%d?page=1" % (self.user_id) + selector = self.deal_html(url) pattern = r"\d+\.?\d*" # 微博数 @@ -84,12 +102,9 @@ def get_user_info(self): # 获取"长原创微博" def get_long_weibo(self, weibo_link): try: - html = requests.get(weibo_link, cookies=self.cookie).content - selector = etree.HTML(html) + selector = self.deal_html(weibo_link) info = selector.xpath("//div[@class='c']")[1] - wb_content = info.xpath( - "string(.)").replace(u"\u200b", "").encode(sys.stdout.encoding, "ignore").decode( - sys.stdout.encoding) + wb_content = self.deal_garbled(info) wb_time = info.xpath("//span[@class='ct']/text()")[0] wb_content = wb_content[wb_content.find( ":") + 1:wb_content.rfind(wb_time)] @@ -101,9 +116,7 @@ def get_long_weibo(self, weibo_link): # 获取原创微博 
def get_original_weibo(self, info): try: - weibo_content = info.xpath("string(.)").replace(u"\u200b", "").encode( - sys.stdout.encoding, "ignore").decode( - sys.stdout.encoding) + weibo_content = self.deal_garbled(info) weibo_content = weibo_content[:weibo_content.rfind(u"赞")] a_text = info.xpath("div//a/text()") if u"全文" in a_text: @@ -136,9 +149,7 @@ def get_retweet(self, info): return wb_content else: original_user = original_user[0] - wb_content = info.xpath("string(.)").replace(u"\u200b", "").encode( - sys.stdout.encoding, "ignore").decode( - sys.stdout.encoding) + wb_content = self.deal_garbled(info) wb_content = wb_content[wb_content.find( ":") + 1:wb_content.rfind(u"赞")] wb_content = wb_content[:wb_content.rfind(u"赞")] @@ -149,9 +160,7 @@ def get_retweet(self, info): wb_content = self.get_long_retweet(weibo_link) if wb_content: weibo_content = wb_content - retweet_reason = info.xpath("div")[-1].xpath("string(.)").replace(u"\u200b", "").encode( - sys.stdout.encoding, "ignore").decode( - sys.stdout.encoding) + retweet_reason = self.deal_garbled(info.xpath("div")[-1]) retweet_reason = retweet_reason[:retweet_reason.rindex(u"赞")] wb_content = (retweet_reason + "\n" + u"原始用户: " + original_user + "\n" + u"转发内容: " + wb_content) @@ -191,8 +200,7 @@ def get_weibo_place(self, info): weibo_place = weibo_a[-2] else: weibo_place = u"无" - weibo_place = weibo_place.xpath("string(.)").encode( - sys.stdout.encoding, "ignore").decode(sys.stdout.encoding) + weibo_place = self.deal_garbled(weibo_place) break self.weibo_place.append(weibo_place) print(u"微博位置: " + weibo_place) @@ -204,8 +212,7 @@ def get_weibo_place(self, info): def get_publish_time(self, info): try: str_time = info.xpath("div/span[@class='ct']") - str_time = str_time[0].xpath("string(.)").encode( - sys.stdout.encoding, "ignore").decode(sys.stdout.encoding) + str_time = self.deal_garbled(str_time[0]) publish_time = str_time.split(u'来自')[0] if u"刚刚" in publish_time: publish_time = datetime.now().strftime( @@ 
-237,8 +244,7 @@ def get_publish_time(self, info): def get_publish_tool(self, info): try: str_time = info.xpath("div/span[@class='ct']") - str_time = str_time[0].xpath("string(.)").encode( - sys.stdout.encoding, "ignore").decode(sys.stdout.encoding) + str_time = self.deal_garbled(str_time[0]) if len(str_time.split(u'来自')) > 1: publish_tool = str_time.split(u'来自')[1] else: @@ -252,10 +258,8 @@ def get_publish_tool(self, info): # 获取用户微博信息 def get_weibo_info(self): try: - url = "https://weibo.cn/u/%d?page=1" % ( - self.user_id) - html = requests.get(url, cookies=self.cookie).content - selector = etree.HTML(html) + url = "https://weibo.cn/u/%d?page=1" % (self.user_id) + selector = self.deal_html(url) if selector.xpath("//input[@name='mp']") == []: page_num = 1 else: @@ -264,8 +268,7 @@ def get_weibo_info(self): pattern = r"\d+\.?\d*" for page in tqdm(range(1, page_num + 1), desc=u"进度"): url2 = "https://weibo.cn/u/%d?page=%d" % (self.user_id, page) - html2 = requests.get(url2, cookies=self.cookie).content - selector2 = etree.HTML(html2) + selector2 = self.deal_html(url2) info = selector2.xpath("//div[@class='c']") is_empty = info[0].xpath("div/span[@class='ctt']") if is_empty: @@ -286,8 +289,7 @@ def get_weibo_info(self): self.get_publish_tool(info[i]) str_footer = info[i].xpath("div")[-1] - str_footer = str_footer.xpath("string(.)").encode( - sys.stdout.encoding, "ignore").decode(sys.stdout.encoding) + str_footer = self.deal_garbled(str_footer) str_footer = str_footer[str_footer.rfind(u'赞'):] guid = re.findall(pattern, str_footer, re.M) @@ -375,8 +377,8 @@ def start(self): def main(): try: # 使用实例,输入一个用户id,所有信息都会存储在wb实例中 - user_id = 2992050891 # 可以改成任意合法的用户id(爬虫的微博id除外) - filter = 0 # 值为0表示爬取全部微博(原创微博+转发微博),值为1表示只爬取原创微博 + user_id = 1711243680 # 可以改成任意合法的用户id(爬虫的微博id除外) + filter = 1 # 值为0表示爬取全部微博(原创微博+转发微博),值为1表示只爬取原创微博 wb = Weibo(user_id, filter) # 调用Weibo类,创建微博实例wb wb.start() # 爬取微博信息 print(u"用户名: " + wb.username) From 5c79cf9b9e59fe022bef8c64998c3b301e6e6c9c Mon Sep 
17 00:00:00 2001 From: chenlei Date: Sun, 19 May 2019 20:36:02 +0800 Subject: [PATCH 014/363] =?UTF-8?q?perf:=20=E4=BC=98=E5=8C=96txt=E7=BB=93?= =?UTF-8?q?=E6=9E=9C=E6=96=87=E4=BB=B6=E5=86=99=E5=85=A5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- weiboSpider.py | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/weiboSpider.py b/weiboSpider.py index 1fb1f306..05cf5250 100644 --- a/weiboSpider.py +++ b/weiboSpider.py @@ -329,23 +329,25 @@ def write_txt(self): result_header = u"\n\n原创微博内容: \n" else: result_header = u"\n\n微博内容: \n" - result = (u"用户信息\n用户昵称:" + self.username + - u"\n用户id: " + str(self.user_id) + - u"\n微博数: " + str(self.weibo_num) + - u"\n关注数: " + str(self.following) + - u"\n粉丝数: " + str(self.followers) + - result_header - ) + temp_result = [] + temp_result.append(u"用户信息\n用户昵称:" + self.username + + u"\n用户id: " + str(self.user_id) + + u"\n微博数: " + str(self.weibo_num) + + u"\n关注数: " + str(self.following) + + u"\n粉丝数: " + str(self.followers) + + result_header + ) for i in range(1, self.weibo_num2 + 1): - text = (str(i) + ":" + self.weibo_content[i - 1] + "\n" + - u"微博位置: " + self.weibo_place[i - 1] + "\n" + - u"发布时间: " + self.publish_time[i - 1] + "\n" + - u"点赞数: " + str(self.up_num[i - 1]) + - u" 转发数: " + str(self.retweet_num[i - 1]) + - u" 评论数: " + str(self.comment_num[i - 1]) + "\n" + - u"发布工具: " + self.publish_tool[i - 1] + "\n\n" - ) - result = result + text + temp_result.append(str(i) + ":" + self.weibo_content[i - 1] + "\n" + + u"微博位置: " + self.weibo_place[i - 1] + "\n" + + u"发布时间: " + self.publish_time[i - 1] + "\n" + + u"点赞数: " + str(self.up_num[i - 1]) + + u" 转发数: " + str(self.retweet_num[i - 1]) + + u" 评论数: " + str(self.comment_num[i - 1]) + "\n" + + u"发布工具: " + + self.publish_tool[i - 1] + "\n\n" + ) + result = ''.join(temp_result) file_dir = os.path.split(os.path.realpath(__file__))[ 0] + os.sep + "weibo" if not os.path.isdir(file_dir): From 
42c8955b190170136902095925e5858237a502e3 Mon Sep 17 00:00:00 2001 From: chenlei Date: Sun, 19 May 2019 21:04:05 +0800 Subject: [PATCH 015/363] =?UTF-8?q?style:=20=E4=BF=AE=E6=94=B9=E4=BB=A3?= =?UTF-8?q?=E7=A0=81=E6=B3=A8=E9=87=8A?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- weiboSpider.py | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/weiboSpider.py b/weiboSpider.py index 05cf5250..cd55104d 100644 --- a/weiboSpider.py +++ b/weiboSpider.py @@ -15,8 +15,8 @@ class Weibo: cookie = {"Cookie": "your cookie"} # 将your cookie替换成自己的cookie - # Weibo类初始化 def __init__(self, user_id, filter=0): + """Weibo类初始化""" self.user_id = user_id # 用户id,即需要我们输入的数字,如昵称为“Dear-迪丽热巴”的id为1669879400 self.filter = filter # 取值范围为0、1,程序默认值为0,代表要爬取用户的全部微博,1代表只爬取用户的原创微博 self.username = '' # 用户名,如“Dear-迪丽热巴” @@ -32,8 +32,8 @@ def __init__(self, user_id, filter=0): self.comment_num = [] # 微博对应的评论数 self.publish_tool = [] # 微博发布工具 - # 处理html def deal_html(self, url): + """处理html""" try: html = requests.get(url, cookies=self.cookie).content selector = etree.HTML(html) @@ -42,8 +42,8 @@ def deal_html(self, url): print("Error: ", e) traceback.print_exc() - # 处理乱码 def deal_garbled(self, info): + """处理乱码""" try: info = info.xpath( "string(.)").replace(u"\u200b", "").encode(sys.stdout.encoding, "ignore").decode( @@ -53,8 +53,8 @@ def deal_garbled(self, info): print("Error: ", e) traceback.print_exc() - # 获取用户昵称 def get_username(self): + """获取用户昵称""" try: url = "https://weibo.cn/%d/info" % (self.user_id) selector = self.deal_html(url) @@ -65,8 +65,8 @@ def get_username(self): print("Error: ", e) traceback.print_exc() - # 获取用户微博数、关注数、粉丝数 def get_user_info(self): + """获取用户微博数、关注数、粉丝数""" try: url = "https://weibo.cn/u/%d?page=1" % (self.user_id) selector = self.deal_html(url) @@ -99,8 +99,8 @@ def get_user_info(self): print("Error: ", e) traceback.print_exc() - # 获取"长原创微博" def get_long_weibo(self, weibo_link): 
+ """获取长原创微博""" try: selector = self.deal_html(weibo_link) info = selector.xpath("//div[@class='c']")[1] @@ -113,8 +113,8 @@ def get_long_weibo(self, weibo_link): print("Error: ", e) traceback.print_exc() - # 获取原创微博 def get_original_weibo(self, info): + """获取原创微博""" try: weibo_content = self.deal_garbled(info) weibo_content = weibo_content[:weibo_content.rfind(u"赞")] @@ -130,8 +130,8 @@ def get_original_weibo(self, info): print("Error: ", e) traceback.print_exc() - # 获取"长转发微博" def get_long_retweet(self, weibo_link): + """获取长转发微博""" try: wb_content = self.get_long_weibo(weibo_link) wb_content = wb_content[:wb_content.rfind(u"原文转发")] @@ -140,8 +140,8 @@ def get_long_retweet(self, weibo_link): print("Error: ", e) traceback.print_exc() - # 获取转发微博 def get_retweet(self, info): + """获取转发微博""" try: original_user = info.xpath("div/span[@class='cmt']/a/text()") if not original_user: @@ -169,8 +169,8 @@ def get_retweet(self, info): print("Error: ", e) traceback.print_exc() - # 获取微博内容 def get_weibo_content(self, info): + """获取微博内容""" try: is_retweet = info.xpath("div/span[@class='cmt']") if is_retweet: @@ -183,8 +183,8 @@ def get_weibo_content(self, info): print("Error: ", e) traceback.print_exc() - # 获取微博发布位置 def get_weibo_place(self, info): + """获取微博发布位置""" try: div_first = info.xpath("div")[0] a_list = div_first.xpath("a") @@ -208,8 +208,8 @@ def get_weibo_place(self, info): print("Error: ", e) traceback.print_exc() - # 获取微博发布时间 def get_publish_time(self, info): + """获取微博发布时间""" try: str_time = info.xpath("div/span[@class='ct']") str_time = self.deal_garbled(str_time[0]) @@ -240,8 +240,8 @@ def get_publish_time(self, info): print("Error: ", e) traceback.print_exc() - # 获取微博发布工具 def get_publish_tool(self, info): + """获取微博发布工具""" try: str_time = info.xpath("div/span[@class='ct']") str_time = self.deal_garbled(str_time[0]) @@ -255,8 +255,8 @@ def get_publish_tool(self, info): print("Error: ", e) traceback.print_exc() - # 获取用户微博信息 def get_weibo_info(self): + """获取用户微博信息""" try: 
url = "https://weibo.cn/u/%d?page=1" % (self.user_id) selector = self.deal_html(url) @@ -322,8 +322,8 @@ def get_weibo_info(self): print("Error: ", e) traceback.print_exc() - # 将爬取的信息写入文件 def write_txt(self): + """将爬取的信息写入文件""" try: if self.filter: result_header = u"\n\n原创微博内容: \n" @@ -362,8 +362,8 @@ def write_txt(self): print("Error: ", e) traceback.print_exc() - # 运行爬虫 def start(self): + """运行爬虫""" try: self.get_username() self.get_user_info() From ae695301805c441d2ba7ee2a5dce54ac4541620b Mon Sep 17 00:00:00 2001 From: chenlei Date: Thu, 23 May 2019 21:42:41 +0800 Subject: [PATCH 016/363] =?UTF-8?q?feat:=20=E6=B7=BB=E5=8A=A0=E7=88=AC?= =?UTF-8?q?=E5=8F=96=E7=BB=93=E6=9E=9C=E5=86=99=E5=85=A5csv=E6=96=87?= =?UTF-8?q?=E4=BB=B6=E5=8A=9F=E8=83=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- weiboSpider.py | 59 ++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 48 insertions(+), 11 deletions(-) diff --git a/weiboSpider.py b/weiboSpider.py index cd55104d..5a067588 100644 --- a/weiboSpider.py +++ b/weiboSpider.py @@ -1,6 +1,8 @@ #!/usr/bin/env python # -*- coding: UTF-8 -*- +import codecs +import csv import os import re import requests @@ -322,8 +324,21 @@ def get_weibo_info(self): print("Error: ", e) traceback.print_exc() + def get_filepath(self, type): + """获取结果文件路径""" + try: + file_dir = os.path.split(os.path.realpath(__file__))[ + 0] + os.sep + "weibo" + if not os.path.isdir(file_dir): + os.mkdir(file_dir) + file_path = file_dir + os.sep + "%d" % self.user_id + "." 
+ type + return file_path + except Exception as e: + print("Error: ", e) + traceback.print_exc() + def write_txt(self): - """将爬取的信息写入文件""" + """将爬取的信息写入txt文件""" try: if self.filter: result_header = u"\n\n原创微博内容: \n" @@ -348,16 +363,36 @@ def write_txt(self): self.publish_tool[i - 1] + "\n\n" ) result = ''.join(temp_result) - file_dir = os.path.split(os.path.realpath(__file__))[ - 0] + os.sep + "weibo" - if not os.path.isdir(file_dir): - os.mkdir(file_dir) - file_path = file_dir + os.sep + "%d" % self.user_id + ".txt" - f = open(file_path, "wb") - f.write(result.encode(sys.stdout.encoding)) - f.close() - print(u"微博写入文件完毕,保存路径:") - print(file_path) + with open(self.get_filepath("txt"), "wb") as f: + f.write(result.encode(sys.stdout.encoding)) + print(u"微博写入txt文件完毕,保存路径:") + print(self.get_filepath("txt")) + except Exception as e: + print("Error: ", e) + traceback.print_exc() + + def write_csv(self): + """将爬取的信息写入csv文件""" + try: + result_headers = ["微博正文", "发布位置", + "发布时间", "发布工具", "点赞数", "转发数", "评论数"] + result_data = zip(self.weibo_content, self.weibo_place, self.publish_time, + self.publish_tool, self.up_num, self.retweet_num, self.comment_num) + if sys.version < '3': # python2.x + reload(sys) + sys.setdefaultencoding('utf-8') + with open(self.get_filepath("csv"), "wb") as f: + f.write(codecs.BOM_UTF8) + writer = csv.writer(f) + writer.writerows([result_headers]) + writer.writerows(result_data) + else: # python3.x + with open(self.get_filepath("csv"), "w", encoding="utf-8-sig", newline="") as f: + writer = csv.writer(f) + writer.writerows([result_headers]) + writer.writerows(result_data) + print(u"微博写入csv文件完毕,保存路径:") + print(self.get_filepath("csv")) except Exception as e: print("Error: ", e) traceback.print_exc() @@ -369,11 +404,13 @@ def start(self): self.get_user_info() self.get_weibo_info() self.write_txt() + self.write_csv() print(u"信息抓取完毕") print( "===========================================================================") except Exception as e: 
print("Error: ", e) + traceback.print_exc() def main(): From a74a136cefab509572e375f0a68f004245832f02 Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Fri, 24 May 2019 19:02:28 +0800 Subject: [PATCH 017/363] Update README.md --- README.md | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 2e6245e8..c90d5067 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,11 @@ # 功能 -爬取新浪微博信息:爬取微博信息,并写入文件,文件结果如图所示: -![](https://picture.cognize.me/cognize/github/weibospider/weibotxt.png) +爬取新浪微博信息,并写入csv/txt文件,文件名为目标用户id加".csv"或".txt"的形式。
+
+csv文件结果如下所示: +![](https://picture.cognize.me/cognize/github/weibospider/weibo_csv.png)*1669879400.csv* + +txt文件结果如下所示: +![](https://picture.cognize.me/cognize/github/weibospider/weibotxt.png)*1669879400.txt* # 输入 用户id,例如新浪微博昵称为“Dear-迪丽热巴”的id为“1669879400” @@ -17,7 +22,7 @@ - 微博对应的转发数:以list的形式存储了用户所有微博对应的转发数 - 微博对应的评论数:以list的形式存储了用户所有微博对应的评论数 - 微博发布工具:以list的形式存储了用户所有微博的发布工具,如iPhone客户端、HUAWEI Mate 20 Pro等 -- 结果文件:保存在当前目录的weibo文件夹里,名字为"user_id.txt"的形式 +- 结果文件:保存在当前目录的weibo文件夹里,名字为"user_id.csv"和"user_id.txt"的形式 # 运行环境 - 开发语言:python2/python3 From b5d2a1aa40902a660ac81655e40efd9d0fd54fde Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Fri, 24 May 2019 21:28:24 +0800 Subject: [PATCH 018/363] Update README.md --- README.md | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index c90d5067..7c2ae6b4 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ # 功能 -爬取新浪微博信息,并写入csv/txt文件,文件名为目标用户id加".csv"或".txt"的形式。
+爬取新浪微博信息,并写入csv/txt文件,文件名为目标用户id加".csv"和".txt"的形式。

csv文件结果如下所示: ![](https://picture.cognize.me/cognize/github/weibospider/weibo_csv.png)*1669879400.csv* @@ -61,10 +61,9 @@ user_id可以改成任意合法的用户id(爬虫的微博id除外);filter **wb.publish_time**: 存储微博的发布时间,为list形式,如wb.publish_time[0]为最新一条微博的发布时间,与wb.weibo_content[0]对应,其它用法同wb.weibo_content;
**wb.up_num**:存储微博获得的点赞数,为list形式,如wb.up_num[0]为最新一条微博获得的点赞数,与wb.weibo_content[0]对应,其它用法同wb.weibo_content;
**wb.retweet_num**:存储微博获得的转发数,为list形式,如wb.retweet_num[0]为最新一条微博获得的转发数,与wb.weibo_content[0]对应,其它用法同wb.weibo_content;
-**wb.comment_num**:存储微博获得的评论数,为list形式,如wb.comment_num[0]为最新一条微博获得的评论数,与wb.weibo_content[0]对应,其它用法同wb.weibo_content。
+**wb.comment_num**:存储微博获得的评论数,为list形式,如wb.comment_num[0]为最新一条微博获得的评论数,与wb.weibo_content[0]对应,其它用法同wb.weibo_content;
**wb.publish_tool**:存储微博的发布工具,为list形式,如wb.publish_tool[0]为最新一条微博的发布工具,与wb.weibo_content[0]对应,其它用法同wb.weibo_content。 - # 如何获取cookie 1.用Chrome打开
2.输入微博的用户名、密码,登录,如图所示: @@ -76,14 +75,13 @@ user_id可以改成任意合法的用户id(爬虫的微博id除外);filter ![](https://picture.cognize.me/cognize/github/weibospider/cookie3.png) # 如何获取user_id -1.打开网址,搜索我们要找的人,如”郭碧婷“,进入她的主页;
-2.大部分情况下,在用户主页的地址栏里就包含了user_id,如”郭碧婷“的地址栏地址为"",其中的"1729370543"就是她的user_id。如图所示: -![](https://picture.cognize.me/cognize/github/weibospider/userid1.png) -但是部分用户设置了个性域名,他们的地址栏地址就变成了""的形式,如柳岩主页的地址栏地址为""。如图所示: -![](https://picture.cognize.me/cognize/github/weibospider/userid2.png) -事实上,如果仅仅爬取微博,用user_id或个性域名都可以,但是因为本脚本还要爬取用户昵称,而用个性域名表示的网页爬取有一些小问题,需要另外的网页。所以,如果遇到地址栏没有user_id的情况,大家可以点击”资料“,跳转到用户资料页面,如柳岩的资料页面地址为"",其中的"1644461042"即为柳岩微博的user_id。如图所示: -![](https://picture.cognize.me/cognize/github/weibospider/userid3.png) +1.打开网址,搜索我们要找的人,如”迪丽热巴“,进入她的主页;
+![](https://picture.cognize.me/cognize/github/weibospider/user_home.png) +2.按照上图箭头所指,点击"资料"链接,跳转到用户资料页面;
+![](https://picture.cognize.me/cognize/github/weibospider/user_info.png) +如上图所示,迪丽热巴微博资料页的地址为"",其中的"1669879400"即为此微博的user_id。
+事实上,此微博的user_id也包含在用户主页()中,之所以我们还要点击主页中的"资料"来获取user_id,是因为很多用户的主页不是""的形式,而是""或""的形式。其中"微号"和user_id都是一串数字,如果仅仅通过主页地址提取user_id,很容易将"微号"误认为user_id。 # 注意事项 1.user_id不能为爬虫微博的user_id。因为要爬微博信息,必须先登录到某个微博账号,此账号我们姑且称为爬虫微博。爬虫微博访问自己的页面和访问其他用户的页面,得到的网页格式不同,所以无法爬取自己的微博信息;
-2.cookie有期限限制,大约有几天的有效期,超过有效期需重新更新cookie。 +2.cookie有期限限制,超过有效期需重新更新cookie。 From 15c09f3ac6da0c772ebb923cfa1573ab8e4f39cb Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Fri, 24 May 2019 21:50:57 +0800 Subject: [PATCH 019/363] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 7c2ae6b4..620ca169 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ csv文件结果如下所示: ![](https://picture.cognize.me/cognize/github/weibospider/weibo_csv.png)*1669879400.csv* txt文件结果如下所示: -![](https://picture.cognize.me/cognize/github/weibospider/weibotxt.png)*1669879400.txt* +![](https://picture.cognize.me/cognize/github/weibospider/weibo_txt.png)*1669879400.txt* # 输入 用户id,例如新浪微博昵称为“Dear-迪丽热巴”的id为“1669879400” From 96f141157c4daaa49324b00ae40f0f78a2eaf142 Mon Sep 17 00:00:00 2001 From: chenlei Date: Mon, 27 May 2019 01:47:00 +0800 Subject: [PATCH 020/363] =?UTF-8?q?perf:=20=E4=BC=98=E5=8C=96=E7=94=A8?= =?UTF-8?q?=E6=88=B7=E4=BF=A1=E6=81=AF=E8=8E=B7=E5=8F=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- weiboSpider.py | 22 ++++++---------------- 1 file changed, 6 insertions(+), 16 deletions(-) diff --git a/weiboSpider.py b/weiboSpider.py index 5a067588..0221d012 100644 --- a/weiboSpider.py +++ b/weiboSpider.py @@ -70,30 +70,20 @@ def get_username(self): def get_user_info(self): """获取用户微博数、关注数、粉丝数""" try: - url = "https://weibo.cn/u/%d?page=1" % (self.user_id) + url = "https://weibo.cn/u/%d" % (self.user_id) selector = self.deal_html(url) - pattern = r"\d+\.?\d*" + weibo_footer = selector.xpath("//div[@class='tip2']/*/text()") # 微博数 - str_wb = selector.xpath( - "//div[@class='tip2']/span[@class='tc']/text()")[0] - guid = re.findall(pattern, str_wb, re.S | re.M) - for value in guid: - num_wb = int(value) - break - self.weibo_num = num_wb + self.weibo_num = int(weibo_footer[0][3:-1]) print(u"微博数: " + str(self.weibo_num)) # 关注数 - str_gz = 
selector.xpath("//div[@class='tip2']/a/text()")[0] - guid = re.findall(pattern, str_gz, re.M) - self.following = int(guid[0]) + self.following = int(weibo_footer[1][3:-1]) print(u"关注数: " + str(self.following)) # 粉丝数 - str_fs = selector.xpath("//div[@class='tip2']/a/text()")[1] - guid = re.findall(pattern, str_fs, re.M) - self.followers = int(guid[0]) + self.followers = int(weibo_footer[2][3:-1]) print(u"粉丝数: " + str(self.followers)) print( "===========================================================================") @@ -416,7 +406,7 @@ def start(self): def main(): try: # 使用实例,输入一个用户id,所有信息都会存储在wb实例中 - user_id = 1711243680 # 可以改成任意合法的用户id(爬虫的微博id除外) + user_id = 1669879400 # 可以改成任意合法的用户id(爬虫的微博id除外) filter = 1 # 值为0表示爬取全部微博(原创微博+转发微博),值为1表示只爬取原创微博 wb = Weibo(user_id, filter) # 调用Weibo类,创建微博实例wb wb.start() # 爬取微博信息 From 904021281407356ac08d6ccad61f23aaf39f2a34 Mon Sep 17 00:00:00 2001 From: chenlei Date: Tue, 28 May 2019 20:43:35 +0800 Subject: [PATCH 021/363] =?UTF-8?q?refactor:=20=E4=BB=A3=E7=A0=81=E9=87=8D?= =?UTF-8?q?=E6=9E=84?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- weiboSpider.py | 167 +++++++++++++++++++++++++------------------------ 1 file changed, 84 insertions(+), 83 deletions(-) diff --git a/weiboSpider.py b/weiboSpider.py index 0221d012..eefc100e 100644 --- a/weiboSpider.py +++ b/weiboSpider.py @@ -21,9 +21,9 @@ def __init__(self, user_id, filter=0): """Weibo类初始化""" self.user_id = user_id # 用户id,即需要我们输入的数字,如昵称为“Dear-迪丽热巴”的id为1669879400 self.filter = filter # 取值范围为0、1,程序默认值为0,代表要爬取用户的全部微博,1代表只爬取用户的原创微博 - self.username = '' # 用户名,如“Dear-迪丽热巴” + self.nickname = '' # 用户昵称,如“Dear-迪丽热巴” self.weibo_num = 0 # 用户全部微博数 - self.weibo_num2 = 0 # 爬取到的微博数 + self.got_num = 0 # 爬取到的微博数 self.following = 0 # 用户关注数 self.followers = 0 # 用户粉丝数 self.weibo_content = [] # 微博内容 @@ -55,38 +55,46 @@ def deal_garbled(self, info): print("Error: ", e) traceback.print_exc() - def get_username(self): + def get_nickname(self): 
"""获取用户昵称""" try: url = "https://weibo.cn/%d/info" % (self.user_id) selector = self.deal_html(url) - username = selector.xpath("//title/text()")[0] - self.username = username[:-3] - print(u"用户名: " + self.username) + nickname = selector.xpath("//title/text()")[0] + self.nickname = nickname[:-3] + print(u"用户昵称: " + self.nickname) except Exception as e: print("Error: ", e) traceback.print_exc() - def get_user_info(self): - """获取用户微博数、关注数、粉丝数""" + def get_user_info(self, selector): + """获取用户昵称、微博数、关注数、粉丝数""" try: - url = "https://weibo.cn/u/%d" % (self.user_id) - selector = self.deal_html(url) - weibo_footer = selector.xpath("//div[@class='tip2']/*/text()") + self.get_nickname() # 获取用户昵称 + user_info = selector.xpath("//div[@class='tip2']/*/text()") - # 微博数 - self.weibo_num = int(weibo_footer[0][3:-1]) + self.weibo_num = int(user_info[0][3:-1]) print(u"微博数: " + str(self.weibo_num)) - # 关注数 - self.following = int(weibo_footer[1][3:-1]) + self.following = int(user_info[1][3:-1]) print(u"关注数: " + str(self.following)) - # 粉丝数 - self.followers = int(weibo_footer[2][3:-1]) + self.followers = int(user_info[2][3:-1]) print(u"粉丝数: " + str(self.followers)) - print( - "===========================================================================") + print("*" * 100) + except Exception as e: + print("Error: ", e) + traceback.print_exc() + + def get_page_num(self, selector): + """获取微博总页数""" + try: + if selector.xpath("//input[@name='mp']") == []: + page_num = 1 + else: + page_num = (int)(selector.xpath( + "//input[@name='mp']")[0].attrib["value"]) + return page_num except Exception as e: print("Error: ", e) traceback.print_exc() @@ -247,69 +255,65 @@ def get_publish_tool(self, info): print("Error: ", e) traceback.print_exc() + def get_weibo_footer(self, info): + """获取微博点赞数、转发数、评论数""" + try: + pattern = r"\d+" + str_footer = info.xpath("div")[-1] + str_footer = self.deal_garbled(str_footer) + str_footer = str_footer[str_footer.rfind(u'赞'):] + weibo_footer = re.findall(pattern, 
str_footer, re.M) + + up_num = int(weibo_footer[0]) + self.up_num.append(up_num) + print(u"点赞数: " + str(up_num)) + + retweet_num = int(weibo_footer[1]) + self.retweet_num.append(retweet_num) + print(u"转发数: " + str(retweet_num)) + + comment_num = int(weibo_footer[2]) + self.comment_num.append(comment_num) + print(u"评论数: " + str(comment_num)) + except Exception as e: + print("Error: ", e) + traceback.print_exc() + + def get_one_page(self, page): + """获取第page页的全部微博""" + try: + url = "https://weibo.cn/u/%d?page=%d" % (self.user_id, page) + selector = self.deal_html(url) + info = selector.xpath("//div[@class='c']") + is_empty = info[0].xpath("div/span[@class='ctt']") + if is_empty: + for i in range(0, len(info) - 2): + is_retweet = info[i].xpath("div/span[@class='cmt']") + if (not self.filter) or (not is_retweet): + self.get_weibo_content(info[i]) # 微博内容 + self.get_weibo_place(info[i]) # 微博位置 + self.get_publish_time(info[i]) # 微博发布时间 + self.get_publish_tool(info[i]) # 微博发布工具 + self.get_weibo_footer(info[i]) # 微博点赞数、转发数、评论数 + self.got_num += 1 + print("-" * 100) + except Exception as e: + print("Error: ", e) + traceback.print_exc() + def get_weibo_info(self): - """获取用户微博信息""" + """获取微博信息""" try: - url = "https://weibo.cn/u/%d?page=1" % (self.user_id) + url = "https://weibo.cn/u/%d" % (self.user_id) selector = self.deal_html(url) - if selector.xpath("//input[@name='mp']") == []: - page_num = 1 - else: - page_num = (int)(selector.xpath( - "//input[@name='mp']")[0].attrib["value"]) - pattern = r"\d+\.?\d*" + self.get_user_info(selector) # 获取用户昵称、微博数、关注数、粉丝数 + page_num = self.get_page_num(selector) # 获取微博总页数 for page in tqdm(range(1, page_num + 1), desc=u"进度"): - url2 = "https://weibo.cn/u/%d?page=%d" % (self.user_id, page) - selector2 = self.deal_html(url2) - info = selector2.xpath("//div[@class='c']") - is_empty = info[0].xpath("div/span[@class='ctt']") - if is_empty: - for i in range(0, len(info) - 2): - is_retweet = info[i].xpath("div/span[@class='cmt']") - if (not 
self.filter) or (not is_retweet): - - # 微博内容 - self.get_weibo_content(info[i]) - - # 微博位置 - self.get_weibo_place(info[i]) - - # 微博发布时间 - self.get_publish_time(info[i]) - - # 微博发布工具 - self.get_publish_tool(info[i]) - - str_footer = info[i].xpath("div")[-1] - str_footer = self.deal_garbled(str_footer) - str_footer = str_footer[str_footer.rfind(u'赞'):] - guid = re.findall(pattern, str_footer, re.M) - - # 点赞数 - up_num = int(guid[0]) - self.up_num.append(up_num) - print(u"点赞数: " + str(up_num)) - - # 转发数 - retweet_num = int(guid[1]) - self.retweet_num.append(retweet_num) - print(u"转发数: " + str(retweet_num)) - - # 评论数 - comment_num = int(guid[2]) - self.comment_num.append(comment_num) - print(u"评论数: " + str(comment_num)) - - self.weibo_num2 += 1 - print( - "===========================================================================") - + self.get_one_page(page) # 获取第page页的全部微博 if not self.filter: - print(u"共" + str(self.weibo_num2) + u"条微博") + print(u"共爬取" + str(self.got_num) + u"条微博") else: - print(u"共" + str(self.weibo_num) + u"条微博,其中" + - str(self.weibo_num2) + u"条为原创微博" - ) + print(u"共爬取" + str(self.got_num) + u"条原创微博") except Exception as e: print("Error: ", e) traceback.print_exc() @@ -335,14 +339,14 @@ def write_txt(self): else: result_header = u"\n\n微博内容: \n" temp_result = [] - temp_result.append(u"用户信息\n用户昵称:" + self.username + + temp_result.append(u"用户信息\n用户昵称:" + self.nickname + u"\n用户id: " + str(self.user_id) + u"\n微博数: " + str(self.weibo_num) + u"\n关注数: " + str(self.following) + u"\n粉丝数: " + str(self.followers) + result_header ) - for i in range(1, self.weibo_num2 + 1): + for i in range(1, self.got_num + 1): temp_result.append(str(i) + ":" + self.weibo_content[i - 1] + "\n" + u"微博位置: " + self.weibo_place[i - 1] + "\n" + u"发布时间: " + self.publish_time[i - 1] + "\n" + @@ -390,14 +394,11 @@ def write_csv(self): def start(self): """运行爬虫""" try: - self.get_username() - self.get_user_info() self.get_weibo_info() self.write_txt() self.write_csv() print(u"信息抓取完毕") - 
print( - "===========================================================================") + print("*" * 100) except Exception as e: print("Error: ", e) traceback.print_exc() @@ -410,7 +411,7 @@ def main(): filter = 1 # 值为0表示爬取全部微博(原创微博+转发微博),值为1表示只爬取原创微博 wb = Weibo(user_id, filter) # 调用Weibo类,创建微博实例wb wb.start() # 爬取微博信息 - print(u"用户名: " + wb.username) + print(u"用户昵称: " + wb.nickname) print(u"全部微博数: " + str(wb.weibo_num)) print(u"关注数: " + str(wb.following)) print(u"粉丝数: " + str(wb.followers)) From 598206c7d716ec098b57cf1a6311502ad09d40b6 Mon Sep 17 00:00:00 2001 From: chenlei Date: Tue, 28 May 2019 22:25:01 +0800 Subject: [PATCH 022/363] =?UTF-8?q?fix:=20=E4=BF=AE=E5=A4=8D=E6=97=A0?= =?UTF-8?q?=E6=B3=95=E7=88=AC=E5=8F=96=E9=83=A8=E5=88=86=E5=BE=AE=E5=8D=9A?= =?UTF-8?q?=E5=8F=91=E5=B8=83=E5=9C=B0=E5=9D=80=E7=9A=84=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- weiboSpider.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/weiboSpider.py b/weiboSpider.py index eefc100e..a702d74c 100644 --- a/weiboSpider.py +++ b/weiboSpider.py @@ -195,7 +195,7 @@ def get_weibo_place(self, info): weibo_a = div_first.xpath("span[@class='ctt']/a") if len(weibo_a) >= 1: weibo_place = weibo_a[-1] - if u"的秒拍视频" in div_first.xpath("span[@class='ctt']/a/text()")[-1]: + if u"视频" == div_first.xpath("span[@class='ctt']/a/text()")[-1][-2:]: if len(weibo_a) >= 2: weibo_place = weibo_a[-2] else: From e6d8ca42ba23a015b2fe4f4073306a30798ea4f6 Mon Sep 17 00:00:00 2001 From: chenlei Date: Wed, 29 May 2019 19:46:48 +0800 Subject: [PATCH 023/363] =?UTF-8?q?feat:=20=E6=B7=BB=E5=8A=A0=E9=9A=8F?= =?UTF-8?q?=E6=9C=BA=E7=AD=89=E5=BE=85=EF=BC=8C=E5=87=8F=E4=BD=8E=E8=A2=AB?= =?UTF-8?q?=E9=99=90=E5=88=B6=E9=A3=8E=E9=99=A9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- weiboSpider.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/weiboSpider.py b/weiboSpider.py 
index a702d74c..09d2b342 100644 --- a/weiboSpider.py +++ b/weiboSpider.py @@ -4,6 +4,7 @@ import codecs import csv import os +import random import re import requests import sys @@ -11,6 +12,7 @@ from datetime import datetime from datetime import timedelta from lxml import etree +from time import sleep from tqdm import tqdm @@ -308,8 +310,19 @@ def get_weibo_info(self): selector = self.deal_html(url) self.get_user_info(selector) # 获取用户昵称、微博数、关注数、粉丝数 page_num = self.get_page_num(selector) # 获取微博总页数 + page1 = 0 + random_pages = random.randint(1, 5) for page in tqdm(range(1, page_num + 1), desc=u"进度"): self.get_one_page(page) # 获取第page页的全部微博 + + # 通过加入随机等待避免被限制。爬虫速度过快容易被系统限制(一段时间后限 + # 制会自动解除),加入随机等待模拟人的操作,可降低被系统限制的风险。默 + # 认是每爬取1到5页随机等待6到10秒,如果仍然被限,可适当增加sleep时间 + if page - page1 == random_pages: + sleep(random.randint(6, 10)) + page1 = page + random_pages = random.randint(1, 5) + if not self.filter: print(u"共爬取" + str(self.got_num) + u"条微博") else: From d82e3ed9490a21bdc8aeca9af766a1da110d2698 Mon Sep 17 00:00:00 2001 From: chenlei Date: Wed, 29 May 2019 21:03:31 +0800 Subject: [PATCH 024/363] =?UTF-8?q?feat:=20=E6=B7=BB=E5=8A=A0=E5=AF=B9cook?= =?UTF-8?q?ie=E6=98=AF=E5=90=A6=E6=AD=A3=E7=A1=AE=E7=9A=84=E5=88=A4?= =?UTF-8?q?=E6=96=AD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- weiboSpider.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/weiboSpider.py b/weiboSpider.py index 09d2b342..8f9f9fac 100644 --- a/weiboSpider.py +++ b/weiboSpider.py @@ -64,6 +64,8 @@ def get_nickname(self): selector = self.deal_html(url) nickname = selector.xpath("//title/text()")[0] self.nickname = nickname[:-3] + if self.nickname == u"登录 - 新": + sys.exit(u"cookie错误或已过期,请按照README中方法重新获取") print(u"用户昵称: " + self.nickname) except Exception as e: print("Error: ", e) From 14f236987955b8723319771b04f048cb6f4bebdb Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Wed, 29 May 2019 21:13:35 +0800 Subject: [PATCH 025/363] Update README.md --- 
README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 620ca169..7eddf3e1 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ txt文件结果如下所示: 用户id,例如新浪微博昵称为“Dear-迪丽热巴”的id为“1669879400” # 输出 -- 用户名:用户昵称,如"Dear-迪丽热巴" +- 昵称:用户昵称,如"Dear-迪丽热巴" - 微博数:用户的全部微博数(转发微博+原创微博) - 关注数:用户关注的微博账号数量 - 粉丝数:用户的粉丝数 @@ -52,7 +52,7 @@ wb = Weibo(user_id,filter) #调用Weibo类,创建微博实例wb wb.start() #爬取微博信息 ``` user_id可以改成任意合法的用户id(爬虫的微博id除外);filter默认值为0,表示爬取所有微博信息(转发微博+原创微博),为1表示只爬取用户的所有原创微博;wb是Weibo类的一个实例,也可以是其它名字,只要符合python的命名规范即可;通过执行wb.start() 完成了微博的爬取工作。在上述代码执行后,我们可以得到很多信息:
-**wb.username**:用户名;
+**wb.nickname**:用户昵称;
**wb.weibo_num**:微博数;
**wb.following**:关注数;
**wb.followers**:粉丝数;
From 574dbe6a8eb32b123f24e0a59269bd37a7a3785c Mon Sep 17 00:00:00 2001 From: chenlei Date: Thu, 30 May 2019 20:23:46 +0800 Subject: [PATCH 026/363] =?UTF-8?q?fix:=20=E4=BF=AE=E5=A4=8D=E4=BB=A3?= =?UTF-8?q?=E7=A0=81=E9=94=99=E8=AF=AF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- weiboSpider.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/weiboSpider.py b/weiboSpider.py index 8f9f9fac..0fe60950 100644 --- a/weiboSpider.py +++ b/weiboSpider.py @@ -161,9 +161,9 @@ def get_retweet(self, info): if u"全文" in a_text: weibo_id = info.xpath("@id")[0][2:] weibo_link = "https://weibo.cn/comment/" + weibo_id - wb_content = self.get_long_retweet(weibo_link) - if wb_content: - weibo_content = wb_content + weibo_content = self.get_long_retweet(weibo_link) + if weibo_content: + wb_content = weibo_content retweet_reason = self.deal_garbled(info.xpath("div")[-1]) retweet_reason = retweet_reason[:retweet_reason.rindex(u"赞")] wb_content = (retweet_reason + "\n" + u"原始用户: " + @@ -423,7 +423,7 @@ def main(): try: # 使用实例,输入一个用户id,所有信息都会存储在wb实例中 user_id = 1669879400 # 可以改成任意合法的用户id(爬虫的微博id除外) - filter = 1 # 值为0表示爬取全部微博(原创微博+转发微博),值为1表示只爬取原创微博 + filter = 0 # 值为0表示爬取全部微博(原创微博+转发微博),值为1表示只爬取原创微博 wb = Weibo(user_id, filter) # 调用Weibo类,创建微博实例wb wb.start() # 爬取微博信息 print(u"用户昵称: " + wb.nickname) From 11758655d6b653a37e938c02b854f4795b90e8e5 Mon Sep 17 00:00:00 2001 From: chenlei Date: Thu, 30 May 2019 20:36:30 +0800 Subject: [PATCH 027/363] =?UTF-8?q?style:=20=E4=BF=AE=E6=94=B9=E5=AF=BC?= =?UTF-8?q?=E5=85=A5=E6=A0=BC=E5=BC=8F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- weiboSpider.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/weiboSpider.py b/weiboSpider.py index 0fe60950..71ad9638 100644 --- a/weiboSpider.py +++ b/weiboSpider.py @@ -6,13 +6,13 @@ import os import random import re -import requests import sys import traceback -from 
datetime import datetime -from datetime import timedelta -from lxml import etree +from datetime import datetime, timedelta from time import sleep + +import requests +from lxml import etree from tqdm import tqdm From 295c09fb8e19979ff7a442fc08ba78743c9c7053 Mon Sep 17 00:00:00 2001 From: chenlei Date: Fri, 31 May 2019 20:42:21 +0800 Subject: [PATCH 028/363] =?UTF-8?q?feat:=20=E5=A2=9E=E5=8A=A0=E8=8E=B7?= =?UTF-8?q?=E5=8F=96=E5=BE=AE=E5=8D=9Aid=E7=9A=84=E5=8A=9F=E8=83=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- weiboSpider.py | 132 +++++++++++++++++++++++++++---------------------- 1 file changed, 74 insertions(+), 58 deletions(-) diff --git a/weiboSpider.py b/weiboSpider.py index 71ad9638..c9bbf91b 100644 --- a/weiboSpider.py +++ b/weiboSpider.py @@ -23,11 +23,12 @@ def __init__(self, user_id, filter=0): """Weibo类初始化""" self.user_id = user_id # 用户id,即需要我们输入的数字,如昵称为“Dear-迪丽热巴”的id为1669879400 self.filter = filter # 取值范围为0、1,程序默认值为0,代表要爬取用户的全部微博,1代表只爬取用户的原创微博 - self.nickname = '' # 用户昵称,如“Dear-迪丽热巴” + self.nickname = "" # 用户昵称,如“Dear-迪丽热巴” self.weibo_num = 0 # 用户全部微博数 self.got_num = 0 # 爬取到的微博数 self.following = 0 # 用户关注数 self.followers = 0 # 用户粉丝数 + self.weibo_id = [] # 微博id self.weibo_content = [] # 微博内容 self.weibo_place = [] # 微博位置 self.publish_time = [] # 微博发布时间 @@ -49,9 +50,8 @@ def deal_html(self, url): def deal_garbled(self, info): """处理乱码""" try: - info = info.xpath( - "string(.)").replace(u"\u200b", "").encode(sys.stdout.encoding, "ignore").decode( - sys.stdout.encoding) + info = (info.xpath("string(.)").replace(u"\u200b", "").encode( + sys.stdout.encoding, "ignore").decode(sys.stdout.encoding)) return info except Exception as e: print("Error: ", e) @@ -96,8 +96,8 @@ def get_page_num(self, selector): if selector.xpath("//input[@name='mp']") == []: page_num = 1 else: - page_num = (int)(selector.xpath( - "//input[@name='mp']")[0].attrib["value"]) + page_num = (int)( + 
selector.xpath("//input[@name='mp']")[0].attrib["value"]) return page_num except Exception as e: print("Error: ", e) @@ -110,21 +110,20 @@ def get_long_weibo(self, weibo_link): info = selector.xpath("//div[@class='c']")[1] wb_content = self.deal_garbled(info) wb_time = info.xpath("//span[@class='ct']/text()")[0] - wb_content = wb_content[wb_content.find( - ":") + 1:wb_content.rfind(wb_time)] + wb_content = wb_content[wb_content.find(":") + + 1:wb_content.rfind(wb_time)] return wb_content except Exception as e: print("Error: ", e) traceback.print_exc() - def get_original_weibo(self, info): + def get_original_weibo(self, info, weibo_id): """获取原创微博""" try: weibo_content = self.deal_garbled(info) weibo_content = weibo_content[:weibo_content.rfind(u"赞")] a_text = info.xpath("div//a/text()") if u"全文" in a_text: - weibo_id = info.xpath("@id")[0][2:] weibo_link = "https://weibo.cn/comment/" + weibo_id wb_content = self.get_long_weibo(weibo_link) if wb_content: @@ -144,7 +143,7 @@ def get_long_retweet(self, weibo_link): print("Error: ", e) traceback.print_exc() - def get_retweet(self, info): + def get_retweet(self, info, weibo_id): """获取转发微博""" try: original_user = info.xpath("div/span[@class='cmt']/a/text()") @@ -154,20 +153,19 @@ def get_retweet(self, info): else: original_user = original_user[0] wb_content = self.deal_garbled(info) - wb_content = wb_content[wb_content.find( - ":") + 1:wb_content.rfind(u"赞")] + wb_content = wb_content[wb_content.find(":") + + 1:wb_content.rfind(u"赞")] wb_content = wb_content[:wb_content.rfind(u"赞")] a_text = info.xpath("div//a/text()") if u"全文" in a_text: - weibo_id = info.xpath("@id")[0][2:] weibo_link = "https://weibo.cn/comment/" + weibo_id weibo_content = self.get_long_retweet(weibo_link) if weibo_content: wb_content = weibo_content retweet_reason = self.deal_garbled(info.xpath("div")[-1]) retweet_reason = retweet_reason[:retweet_reason.rindex(u"赞")] - wb_content = (retweet_reason + "\n" + u"原始用户: " + - original_user + "\n" + u"转发内容: 
" + wb_content) + wb_content = (retweet_reason + "\n" + u"原始用户: " + original_user + + "\n" + u"转发内容: " + wb_content) return wb_content except Exception as e: print("Error: ", e) @@ -176,11 +174,13 @@ def get_retweet(self, info): def get_weibo_content(self, info): """获取微博内容""" try: + weibo_id = info.xpath("@id")[0][2:] + self.weibo_id.append(weibo_id) is_retweet = info.xpath("div/span[@class='cmt']") if is_retweet: - weibo_content = self.get_retweet(info) + weibo_content = self.get_retweet(info, weibo_id) else: - weibo_content = self.get_original_weibo(info) + weibo_content = self.get_original_weibo(info, weibo_id) self.weibo_content.append(weibo_content) print(weibo_content) except Exception as e: @@ -194,12 +194,13 @@ def get_weibo_place(self, info): a_list = div_first.xpath("a") weibo_place = u"无" for a in a_list: - if ("place.weibo.com" in a.xpath("@href")[0] and - a.xpath("text()")[0] == u"显示地图"): + if ("place.weibo.com" in a.xpath("@href")[0] + and a.xpath("text()")[0] == u"显示地图"): weibo_a = div_first.xpath("span[@class='ctt']/a") if len(weibo_a) >= 1: weibo_place = weibo_a[-1] - if u"视频" == div_first.xpath("span[@class='ctt']/a/text()")[-1][-2:]: + if (u"视频" == div_first.xpath( + "span[@class='ctt']/a/text()")[-1][-2:]): if len(weibo_a) >= 2: weibo_place = weibo_a[-2] else: @@ -217,15 +218,14 @@ def get_publish_time(self, info): try: str_time = info.xpath("div/span[@class='ct']") str_time = self.deal_garbled(str_time[0]) - publish_time = str_time.split(u'来自')[0] + publish_time = str_time.split(u"来自")[0] if u"刚刚" in publish_time: - publish_time = datetime.now().strftime( - '%Y-%m-%d %H:%M') + publish_time = datetime.now().strftime("%Y-%m-%d %H:%M") elif u"分钟" in publish_time: minute = publish_time[:publish_time.find(u"分钟")] minute = timedelta(minutes=int(minute)) - publish_time = (datetime.now() - minute).strftime( - "%Y-%m-%d %H:%M") + publish_time = (datetime.now() - + minute).strftime("%Y-%m-%d %H:%M") elif u"今天" in publish_time: today = 
datetime.now().strftime("%Y-%m-%d") time = publish_time[3:] @@ -235,7 +235,7 @@ def get_publish_time(self, info): month = publish_time[0:2] day = publish_time[3:5] time = publish_time[7:12] - publish_time = (year + "-" + month + "-" + day + " " + time) + publish_time = year + "-" + month + "-" + day + " " + time else: publish_time = publish_time[:16] self.publish_time.append(publish_time) @@ -249,8 +249,8 @@ def get_publish_tool(self, info): try: str_time = info.xpath("div/span[@class='ct']") str_time = self.deal_garbled(str_time[0]) - if len(str_time.split(u'来自')) > 1: - publish_tool = str_time.split(u'来自')[1] + if len(str_time.split(u"来自")) > 1: + publish_tool = str_time.split(u"来自")[1] else: publish_tool = u"无" self.publish_tool.append(publish_tool) @@ -265,7 +265,7 @@ def get_weibo_footer(self, info): pattern = r"\d+" str_footer = info.xpath("div")[-1] str_footer = self.deal_garbled(str_footer) - str_footer = str_footer[str_footer.rfind(u'赞'):] + str_footer = str_footer[str_footer.rfind(u"赞"):] weibo_footer = re.findall(pattern, str_footer, re.M) up_num = int(weibo_footer[0]) @@ -336,8 +336,8 @@ def get_weibo_info(self): def get_filepath(self, type): """获取结果文件路径""" try: - file_dir = os.path.split(os.path.realpath(__file__))[ - 0] + os.sep + "weibo" + file_dir = os.path.split( + os.path.realpath(__file__))[0] + os.sep + "weibo" if not os.path.isdir(file_dir): os.mkdir(file_dir) file_path = file_dir + os.sep + "%d" % self.user_id + "." 
+ type @@ -354,24 +354,21 @@ def write_txt(self): else: result_header = u"\n\n微博内容: \n" temp_result = [] - temp_result.append(u"用户信息\n用户昵称:" + self.nickname + - u"\n用户id: " + str(self.user_id) + - u"\n微博数: " + str(self.weibo_num) + - u"\n关注数: " + str(self.following) + - u"\n粉丝数: " + str(self.followers) + - result_header - ) + temp_result.append(u"用户信息\n用户昵称:" + self.nickname + u"\n用户id: " + + str(self.user_id) + u"\n微博数: " + + str(self.weibo_num) + u"\n关注数: " + + str(self.following) + u"\n粉丝数: " + + str(self.followers) + result_header) for i in range(1, self.got_num + 1): - temp_result.append(str(i) + ":" + self.weibo_content[i - 1] + "\n" + - u"微博位置: " + self.weibo_place[i - 1] + "\n" + - u"发布时间: " + self.publish_time[i - 1] + "\n" + - u"点赞数: " + str(self.up_num[i - 1]) + - u" 转发数: " + str(self.retweet_num[i - 1]) + - u" 评论数: " + str(self.comment_num[i - 1]) + "\n" + - u"发布工具: " + - self.publish_tool[i - 1] + "\n\n" - ) - result = ''.join(temp_result) + temp_result.append( + str(i) + ":" + self.weibo_content[i - 1] + "\n" + + u"微博位置: " + self.weibo_place[i - 1] + "\n" + u"发布时间: " + + self.publish_time[i - 1] + "\n" + u"点赞数: " + + str(self.up_num[i - 1]) + u" 转发数: " + + str(self.retweet_num[i - 1]) + u" 评论数: " + + str(self.comment_num[i - 1]) + "\n" + u"发布工具: " + + self.publish_tool[i - 1] + "\n\n") + result = "".join(temp_result) with open(self.get_filepath("txt"), "wb") as f: f.write(result.encode(sys.stdout.encoding)) print(u"微博写入txt文件完毕,保存路径:") @@ -383,20 +380,39 @@ def write_txt(self): def write_csv(self): """将爬取的信息写入csv文件""" try: - result_headers = ["微博正文", "发布位置", - "发布时间", "发布工具", "点赞数", "转发数", "评论数"] - result_data = zip(self.weibo_content, self.weibo_place, self.publish_time, - self.publish_tool, self.up_num, self.retweet_num, self.comment_num) - if sys.version < '3': # python2.x + result_headers = [ + "微博id", + "微博正文", + "发布位置", + "发布时间", + "发布工具", + "点赞数", + "转发数", + "评论数", + ] + result_data = zip( + self.weibo_id, + self.weibo_content, + 
self.weibo_place, + self.publish_time, + self.publish_tool, + self.up_num, + self.retweet_num, + self.comment_num, + ) + if sys.version < "3": # python2.x reload(sys) - sys.setdefaultencoding('utf-8') + sys.setdefaultencoding("utf-8") with open(self.get_filepath("csv"), "wb") as f: f.write(codecs.BOM_UTF8) writer = csv.writer(f) writer.writerows([result_headers]) writer.writerows(result_data) - else: # python3.x - with open(self.get_filepath("csv"), "w", encoding="utf-8-sig", newline="") as f: + else: # python3.x + with open(self.get_filepath("csv"), + "w", + encoding="utf-8-sig", + newline="") as f: writer = csv.writer(f) writer.writerows([result_headers]) writer.writerows(result_data) @@ -423,7 +439,7 @@ def main(): try: # 使用实例,输入一个用户id,所有信息都会存储在wb实例中 user_id = 1669879400 # 可以改成任意合法的用户id(爬虫的微博id除外) - filter = 0 # 值为0表示爬取全部微博(原创微博+转发微博),值为1表示只爬取原创微博 + filter = 1 # 值为0表示爬取全部微博(原创微博+转发微博),值为1表示只爬取原创微博 wb = Weibo(user_id, filter) # 调用Weibo类,创建微博实例wb wb.start() # 爬取微博信息 print(u"用户昵称: " + wb.nickname) From bcf414a3bc4db7753d50cbe6493bc4aed6c6a86d Mon Sep 17 00:00:00 2001 From: chenlei Date: Sun, 2 Jun 2019 13:18:59 +0800 Subject: [PATCH 029/363] =?UTF-8?q?fix:=20=E5=B0=86=E5=86=99=E6=96=87?= =?UTF-8?q?=E4=BB=B6=E6=94=B9=E6=88=90=E6=AF=8F=E7=88=AC20=E9=A1=B5?= =?UTF-8?q?=E5=BE=AE=E5=8D=9A=E5=86=99=E5=85=A5=E4=B8=80=E6=AC=A1=E6=96=87?= =?UTF-8?q?=E4=BB=B6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Issue #62 --- weiboSpider.py | 164 +++++++++++++++++++++++++++---------------------- 1 file changed, 89 insertions(+), 75 deletions(-) diff --git a/weiboSpider.py b/weiboSpider.py index c9bbf91b..086bf6b3 100644 --- a/weiboSpider.py +++ b/weiboSpider.py @@ -305,34 +305,6 @@ def get_one_page(self, page): print("Error: ", e) traceback.print_exc() - def get_weibo_info(self): - """获取微博信息""" - try: - url = "https://weibo.cn/u/%d" % (self.user_id) - selector = self.deal_html(url) - self.get_user_info(selector) # 获取用户昵称、微博数、关注数、粉丝数 - 
page_num = self.get_page_num(selector) # 获取微博总页数 - page1 = 0 - random_pages = random.randint(1, 5) - for page in tqdm(range(1, page_num + 1), desc=u"进度"): - self.get_one_page(page) # 获取第page页的全部微博 - - # 通过加入随机等待避免被限制。爬虫速度过快容易被系统限制(一段时间后限 - # 制会自动解除),加入随机等待模拟人的操作,可降低被系统限制的风险。默 - # 认是每爬取1到5页随机等待6到10秒,如果仍然被限,可适当增加sleep时间 - if page - page1 == random_pages: - sleep(random.randint(6, 10)) - page1 = page - random_pages = random.randint(1, 5) - - if not self.filter: - print(u"共爬取" + str(self.got_num) + u"条微博") - else: - print(u"共爬取" + str(self.got_num) + u"条原创微博") - except Exception as e: - print("Error: ", e) - traceback.print_exc() - def get_filepath(self, type): """获取结果文件路径""" try: @@ -346,38 +318,7 @@ def get_filepath(self, type): print("Error: ", e) traceback.print_exc() - def write_txt(self): - """将爬取的信息写入txt文件""" - try: - if self.filter: - result_header = u"\n\n原创微博内容: \n" - else: - result_header = u"\n\n微博内容: \n" - temp_result = [] - temp_result.append(u"用户信息\n用户昵称:" + self.nickname + u"\n用户id: " + - str(self.user_id) + u"\n微博数: " + - str(self.weibo_num) + u"\n关注数: " + - str(self.following) + u"\n粉丝数: " + - str(self.followers) + result_header) - for i in range(1, self.got_num + 1): - temp_result.append( - str(i) + ":" + self.weibo_content[i - 1] + "\n" + - u"微博位置: " + self.weibo_place[i - 1] + "\n" + u"发布时间: " + - self.publish_time[i - 1] + "\n" + u"点赞数: " + - str(self.up_num[i - 1]) + u" 转发数: " + - str(self.retweet_num[i - 1]) + u" 评论数: " + - str(self.comment_num[i - 1]) + "\n" + u"发布工具: " + - self.publish_tool[i - 1] + "\n\n") - result = "".join(temp_result) - with open(self.get_filepath("txt"), "wb") as f: - f.write(result.encode(sys.stdout.encoding)) - print(u"微博写入txt文件完毕,保存路径:") - print(self.get_filepath("txt")) - except Exception as e: - print("Error: ", e) - traceback.print_exc() - - def write_csv(self): + def write_csv(self, wrote_num): """将爬取的信息写入csv文件""" try: result_headers = [ @@ -391,43 +332,116 @@ def write_csv(self): "评论数", ] result_data = zip( - 
self.weibo_id, - self.weibo_content, - self.weibo_place, - self.publish_time, - self.publish_tool, - self.up_num, - self.retweet_num, - self.comment_num, + self.weibo_id[wrote_num:], + self.weibo_content[wrote_num:], + self.weibo_place[wrote_num:], + self.publish_time[wrote_num:], + self.publish_tool[wrote_num:], + self.up_num[wrote_num:], + self.retweet_num[wrote_num:], + self.comment_num[wrote_num:], ) if sys.version < "3": # python2.x reload(sys) sys.setdefaultencoding("utf-8") - with open(self.get_filepath("csv"), "wb") as f: + with open(self.get_filepath("csv"), "ab") as f: f.write(codecs.BOM_UTF8) writer = csv.writer(f) - writer.writerows([result_headers]) + if wrote_num == 0: + writer.writerows([result_headers]) writer.writerows(result_data) else: # python3.x with open(self.get_filepath("csv"), - "w", + "a", encoding="utf-8-sig", newline="") as f: writer = csv.writer(f) - writer.writerows([result_headers]) + if wrote_num == 0: + writer.writerows([result_headers]) writer.writerows(result_data) - print(u"微博写入csv文件完毕,保存路径:") + print(u"%d条微博写入csv文件完毕,保存路径:" % self.got_num) print(self.get_filepath("csv")) except Exception as e: print("Error: ", e) traceback.print_exc() + def write_txt(self, wrote_num): + """将爬取的信息写入txt文件""" + try: + temp_result = [] + if wrote_num == 0: + if self.filter: + result_header = u"\n\n原创微博内容: \n" + else: + result_header = u"\n\n微博内容: \n" + result_header = (u"用户信息\n用户昵称:" + self.nickname + u"\n用户id: " + + str(self.user_id) + u"\n微博数: " + + str(self.weibo_num) + u"\n关注数: " + + str(self.following) + u"\n粉丝数: " + + str(self.followers) + result_header) + temp_result.append(result_header) + for i in range(wrote_num, self.got_num): + temp_result.append( + str(i + 1) + ":" + self.weibo_content[i] + "\n" + + u"微博位置: " + self.weibo_place[i] + "\n" + u"发布时间: " + + self.publish_time[i] + "\n" + u"点赞数: " + + str(self.up_num[i]) + u" 转发数: " + + str(self.retweet_num[i]) + u" 评论数: " + + str(self.comment_num[i]) + "\n" + u"发布工具: " + + 
self.publish_tool[i] + "\n\n") + result = "".join(temp_result) + with open(self.get_filepath("txt"), "ab") as f: + f.write(result.encode(sys.stdout.encoding)) + print(u"%d条微博写入txt文件完毕,保存路径:" % self.got_num) + print(self.get_filepath("txt")) + except Exception as e: + print("Error: ", e) + traceback.print_exc() + + def write_file(self, wrote_num): + """写文件""" + if self.got_num > wrote_num: + self.write_csv(wrote_num) + self.write_txt(wrote_num) + + def get_weibo_info(self): + """获取微博信息""" + try: + url = "https://weibo.cn/u/%d" % (self.user_id) + selector = self.deal_html(url) + self.get_user_info(selector) # 获取用户昵称、微博数、关注数、粉丝数 + page_num = self.get_page_num(selector) # 获取微博总页数 + wrote_num = 0 + page1 = 0 + random_pages = random.randint(1, 5) + for page in tqdm(range(1, page_num + 1), desc=u"进度"): + self.get_one_page(page) # 获取第page页的全部微博 + + if page % 20 == 0: # 每爬20页写入一次文件 + self.write_file(wrote_num) + wrote_num = self.got_num + + # 通过加入随机等待避免被限制。爬虫速度过快容易被系统限制(一段时间后限 + # 制会自动解除),加入随机等待模拟人的操作,可降低被系统限制的风险。默 + # 认是每爬取1到5页随机等待6到10秒,如果仍然被限,可适当增加sleep时间 + if page - page1 == random_pages: + sleep(random.randint(6, 10)) + page1 = page + random_pages = random.randint(1, 5) + + self.write_file(wrote_num) # 将剩余不足20页的微博写入文件 + if not self.filter: + print(u"共爬取" + str(self.got_num) + u"条微博") + else: + print(u"共爬取" + str(self.got_num) + u"条原创微博") + except Exception as e: + print("Error: ", e) + traceback.print_exc() + def start(self): """运行爬虫""" try: self.get_weibo_info() - self.write_txt() - self.write_csv() print(u"信息抓取完毕") print("*" * 100) except Exception as e: From 32e28fe99d33791c0beca5b114c85b3a3237a2d8 Mon Sep 17 00:00:00 2001 From: chenlei Date: Tue, 4 Jun 2019 01:54:46 +0800 Subject: [PATCH 030/363] =?UTF-8?q?feat:=20=E5=A2=9E=E5=8A=A0=E8=8E=B7?= =?UTF-8?q?=E5=8F=96=E5=8E=9F=E5=88=9B=E5=BE=AE=E5=8D=9A=E5=8E=9F=E5=A7=8B?= =?UTF-8?q?=E5=9B=BE=E7=89=87url=E7=9A=84=E5=8A=9F=E8=83=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 
--- weiboSpider.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/weiboSpider.py b/weiboSpider.py index 086bf6b3..19881543 100644 --- a/weiboSpider.py +++ b/weiboSpider.py @@ -30,6 +30,7 @@ def __init__(self, user_id, filter=0): self.followers = 0 # 用户粉丝数 self.weibo_id = [] # 微博id self.weibo_content = [] # 微博内容 + self.weibo_pictures = [] # 微博原始图片的url self.weibo_place = [] # 微博位置 self.publish_time = [] # 微博发布时间 self.up_num = [] # 微博对应的点赞数 @@ -283,6 +284,26 @@ def get_weibo_footer(self, info): print("Error: ", e) traceback.print_exc() + def get_picture_urls(self, info): + """获取微博原始图片url""" + weibo_id = info.xpath("@id")[0][2:] + a_list = info.xpath("./div/a/@href") + first_pic = "https://weibo.cn/mblog/pic/" + weibo_id + "?rl=0" + all_pic = "https://weibo.cn/mblog/picAll/" + weibo_id + "?rl=1" + if first_pic in a_list: + if all_pic in a_list: + selector = self.deal_html(all_pic) + preview_picture = selector.xpath("//img/@src") + original_picture = [ + p.replace("thumb180", "large") for p in preview_picture + ] + else: + preview_picture = info.xpath("./div/a/img/@src")[0] + original_picture = preview_picture.replace("wap180", "large") + else: + original_picture = "无" + self.weibo_pictures.append(original_picture) + def get_one_page(self, page): """获取第page页的全部微博""" try: @@ -295,6 +316,7 @@ def get_one_page(self, page): is_retweet = info[i].xpath("div/span[@class='cmt']") if (not self.filter) or (not is_retweet): self.get_weibo_content(info[i]) # 微博内容 + self.get_picture_urls(info[i]) # 微博原始图片url self.get_weibo_place(info[i]) # 微博位置 self.get_publish_time(info[i]) # 微博发布时间 self.get_publish_tool(info[i]) # 微博发布工具 @@ -324,6 +346,7 @@ def write_csv(self, wrote_num): result_headers = [ "微博id", "微博正文", + "原始图片url", "发布位置", "发布时间", "发布工具", @@ -334,6 +357,7 @@ def write_csv(self, wrote_num): result_data = zip( self.weibo_id[wrote_num:], self.weibo_content[wrote_num:], + self.weibo_pictures[wrote_num:], self.weibo_place[wrote_num:], 
self.publish_time[wrote_num:], self.publish_tool[wrote_num:], From 65b0468987c4332de4e0a122f489f2cd3a4df478 Mon Sep 17 00:00:00 2001 From: chenlei Date: Tue, 4 Jun 2019 18:44:47 +0800 Subject: [PATCH 031/363] =?UTF-8?q?fix:=20=E4=BF=AE=E5=A4=8D=E7=88=AC?= =?UTF-8?q?=E8=99=AB=E5=BE=AE=E5=8D=9A=E5=AF=B9=E5=8E=9F=E5=88=9B=E5=BE=AE?= =?UTF-8?q?=E5=8D=9A=E7=82=B9=E8=B5=9E=E7=AD=89=E6=93=8D=E4=BD=9C=E5=90=8E?= =?UTF-8?q?=E8=AF=AF=E8=AE=A4=E4=B8=BA=E8=BD=AC=E5=8F=91=E5=BE=AE=E5=8D=9A?= =?UTF-8?q?=E7=9A=84=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- weiboSpider.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/weiboSpider.py b/weiboSpider.py index 19881543..b1698073 100644 --- a/weiboSpider.py +++ b/weiboSpider.py @@ -172,12 +172,20 @@ def get_retweet(self, info, weibo_id): print("Error: ", e) traceback.print_exc() + def is_retweet(self, info): + """判断微博是否为转发微博""" + is_retweet = info.xpath("div/span[@class='cmt']") + if len(is_retweet) > 3: + return True + else: + return False + def get_weibo_content(self, info): """获取微博内容""" try: weibo_id = info.xpath("@id")[0][2:] self.weibo_id.append(weibo_id) - is_retweet = info.xpath("div/span[@class='cmt']") + is_retweet = self.is_retweet(info) if is_retweet: weibo_content = self.get_retweet(info, weibo_id) else: @@ -287,7 +295,7 @@ def get_weibo_footer(self, info): def get_picture_urls(self, info): """获取微博原始图片url""" weibo_id = info.xpath("@id")[0][2:] - a_list = info.xpath("./div/a/@href") + a_list = info.xpath("div/a/@href") first_pic = "https://weibo.cn/mblog/pic/" + weibo_id + "?rl=0" all_pic = "https://weibo.cn/mblog/picAll/" + weibo_id + "?rl=1" if first_pic in a_list: @@ -295,10 +303,10 @@ def get_picture_urls(self, info): selector = self.deal_html(all_pic) preview_picture = selector.xpath("//img/@src") original_picture = [ - p.replace("thumb180", "large") for p in preview_picture + p.replace("/thumb180/", "/large/") 
for p in preview_picture ] else: - preview_picture = info.xpath("./div/a/img/@src")[0] + preview_picture = info.xpath("div/a/img/@src")[0] original_picture = preview_picture.replace("wap180", "large") else: original_picture = "无" @@ -313,7 +321,7 @@ def get_one_page(self, page): is_empty = info[0].xpath("div/span[@class='ctt']") if is_empty: for i in range(0, len(info) - 2): - is_retweet = info[i].xpath("div/span[@class='cmt']") + is_retweet = self.is_retweet(info[i]) if (not self.filter) or (not is_retweet): self.get_weibo_content(info[i]) # 微博内容 self.get_picture_urls(info[i]) # 微博原始图片url From 0eb58e82cc40f66a8ebb3a06453e08303f163f5d Mon Sep 17 00:00:00 2001 From: chenlei Date: Wed, 5 Jun 2019 02:14:23 +0800 Subject: [PATCH 032/363] =?UTF-8?q?feat:=20=E5=A2=9E=E5=8A=A0=E8=8E=B7?= =?UTF-8?q?=E5=8F=96=E5=BE=AE=E5=8D=9A=E5=8E=9F=E5=A7=8B=E5=9B=BE=E7=89=87?= =?UTF-8?q?url=E7=9A=84=E5=8A=9F=E8=83=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 包括如下原始图片的url: - 原创微博中的图片 - 被转发微博中的图片 - 转发理由中的图片 Issue #12 --- weiboSpider.py | 150 +++++++++++++++++++++++++++++++++---------------- 1 file changed, 101 insertions(+), 49 deletions(-) diff --git a/weiboSpider.py b/weiboSpider.py index b1698073..4137113e 100644 --- a/weiboSpider.py +++ b/weiboSpider.py @@ -21,16 +21,18 @@ class Weibo: def __init__(self, user_id, filter=0): """Weibo类初始化""" - self.user_id = user_id # 用户id,即需要我们输入的数字,如昵称为“Dear-迪丽热巴”的id为1669879400 - self.filter = filter # 取值范围为0、1,程序默认值为0,代表要爬取用户的全部微博,1代表只爬取用户的原创微博 - self.nickname = "" # 用户昵称,如“Dear-迪丽热巴” + self.user_id = user_id # 用户id,即需要我们输入的数字,如昵称为"Dear-迪丽热巴"的id为1669879400 + self.filter = filter # 取值范围为0、1,程序默认值为0,代表要爬取用户的全部微博,1代表只爬取用户的原创微博 + self.nickname = "" # 用户昵称,如“Dear-迪丽热巴” self.weibo_num = 0 # 用户全部微博数 self.got_num = 0 # 爬取到的微博数 self.following = 0 # 用户关注数 self.followers = 0 # 用户粉丝数 self.weibo_id = [] # 微博id self.weibo_content = [] # 微博内容 - self.weibo_pictures = [] # 微博原始图片的url + self.weibo_pictures = [] # 
微博原始图片的url,包括原创微博的原始图片url和转发微博"转发理由"中图片的url + self.retweet_pictures = [] # 被转发微博中原始图片的url + self.original = [] # 是否为原创微博 self.weibo_place = [] # 微博位置 self.publish_time = [] # 微博发布时间 self.up_num = [] # 微博对应的点赞数 @@ -172,24 +174,23 @@ def get_retweet(self, info, weibo_id): print("Error: ", e) traceback.print_exc() - def is_retweet(self, info): - """判断微博是否为转发微博""" - is_retweet = info.xpath("div/span[@class='cmt']") - if len(is_retweet) > 3: - return True - else: + def is_original(self, info): + """判断微博是否为原创微博""" + is_original = info.xpath("div/span[@class='cmt']") + if len(is_original) > 3: return False + else: + return True - def get_weibo_content(self, info): + def get_weibo_content(self, info, is_original): """获取微博内容""" try: weibo_id = info.xpath("@id")[0][2:] self.weibo_id.append(weibo_id) - is_retweet = self.is_retweet(info) - if is_retweet: - weibo_content = self.get_retweet(info, weibo_id) - else: + if is_original: weibo_content = self.get_original_weibo(info, weibo_id) + else: + weibo_content = self.get_retweet(info, weibo_id) self.weibo_content.append(weibo_content) print(weibo_content) except Exception as e: @@ -292,25 +293,47 @@ def get_weibo_footer(self, info): print("Error: ", e) traceback.print_exc() - def get_picture_urls(self, info): - """获取微博原始图片url""" - weibo_id = info.xpath("@id")[0][2:] + def extract_picture_urls(self, info, weibo_id): + """提取微博原始图片url""" a_list = info.xpath("div/a/@href") first_pic = "https://weibo.cn/mblog/pic/" + weibo_id + "?rl=0" all_pic = "https://weibo.cn/mblog/picAll/" + weibo_id + "?rl=1" if first_pic in a_list: if all_pic in a_list: selector = self.deal_html(all_pic) - preview_picture = selector.xpath("//img/@src") - original_picture = [ - p.replace("/thumb180/", "/large/") for p in preview_picture + preview_picture_list = selector.xpath("//img/@src") + picture_list = [ + p.replace("/thumb180/", "/large/") + for p in preview_picture_list ] + picture = ",".join(picture_list) else: preview_picture = 
info.xpath("div/a/img/@src")[0] - original_picture = preview_picture.replace("wap180", "large") + picture = preview_picture.replace("/wap180/", "/large/") else: + picture = "无" + return picture + + def get_picture_urls(self, info, is_original): + """获取微博原始图片url""" + weibo_id = info.xpath("@id")[0][2:] + if is_original: + original_picture = self.extract_picture_urls(info, weibo_id) + self.weibo_pictures.append(original_picture) + if not self.filter: + self.retweet_pictures.append("无") + else: + retweet_url = info.xpath("div/a[@class='cc']/@href")[0] + retweet_id = retweet_url.split("/")[-1].split("?")[0] + retweet_pictures = self.extract_picture_urls(info, retweet_id) + self.retweet_pictures.append(retweet_pictures) + a_list = info.xpath("div[last()]/a/@href") original_picture = "无" - self.weibo_pictures.append(original_picture) + for a in a_list: + if a.endswith(".jpg"): + original_picture = a + break + self.weibo_pictures.append(original_picture) def get_one_page(self, page): """获取第page页的全部微博""" @@ -321,10 +344,11 @@ def get_one_page(self, page): is_empty = info[0].xpath("div/span[@class='ctt']") if is_empty: for i in range(0, len(info) - 2): - is_retweet = self.is_retweet(info[i]) - if (not self.filter) or (not is_retweet): - self.get_weibo_content(info[i]) # 微博内容 - self.get_picture_urls(info[i]) # 微博原始图片url + is_original = self.is_original(info[i]) + self.original.append(is_original) + if (not self.filter) or is_original: + self.get_weibo_content(info[i], is_original) # 微博内容 + self.get_picture_urls(info[i], is_original) # 微博图片url self.get_weibo_place(info[i]) # 微博位置 self.get_publish_time(info[i]) # 微博发布时间 self.get_publish_tool(info[i]) # 微博发布工具 @@ -351,28 +375,56 @@ def get_filepath(self, type): def write_csv(self, wrote_num): """将爬取的信息写入csv文件""" try: - result_headers = [ - "微博id", - "微博正文", - "原始图片url", - "发布位置", - "发布时间", - "发布工具", - "点赞数", - "转发数", - "评论数", - ] - result_data = zip( - self.weibo_id[wrote_num:], - self.weibo_content[wrote_num:], - 
self.weibo_pictures[wrote_num:], - self.weibo_place[wrote_num:], - self.publish_time[wrote_num:], - self.publish_tool[wrote_num:], - self.up_num[wrote_num:], - self.retweet_num[wrote_num:], - self.comment_num[wrote_num:], - ) + if self.filter: + result_headers = [ + "微博id", + "微博正文", + "原始图片url", + "发布位置", + "发布时间", + "发布工具", + "点赞数", + "转发数", + "评论数", + ] + result_data = zip( + self.weibo_id[wrote_num:], + self.weibo_content[wrote_num:], + self.weibo_pictures[wrote_num:], + self.weibo_place[wrote_num:], + self.publish_time[wrote_num:], + self.publish_tool[wrote_num:], + self.up_num[wrote_num:], + self.retweet_num[wrote_num:], + self.comment_num[wrote_num:], + ) + else: + result_headers = [ + "微博id", + "微博正文", + "原始图片url", + "被转发微博原始图片url", + "是否为原创微博", + "发布位置", + "发布时间", + "发布工具", + "点赞数", + "转发数", + "评论数", + ] + result_data = zip( + self.weibo_id[wrote_num:], + self.weibo_content[wrote_num:], + self.weibo_pictures[wrote_num:], + self.retweet_pictures[wrote_num:], + self.original[wrote_num:], + self.weibo_place[wrote_num:], + self.publish_time[wrote_num:], + self.publish_tool[wrote_num:], + self.up_num[wrote_num:], + self.retweet_num[wrote_num:], + self.comment_num[wrote_num:], + ) if sys.version < "3": # python2.x reload(sys) sys.setdefaultencoding("utf-8") From 042810dbcdd1b5cd13dc0c01e4932cc619165cfe Mon Sep 17 00:00:00 2001 From: chenlei Date: Sun, 9 Jun 2019 13:25:48 +0800 Subject: [PATCH 033/363] =?UTF-8?q?fix:=20=E4=BF=AE=E5=A4=8D=E9=83=A8?= =?UTF-8?q?=E5=88=86=E7=B3=BB=E7=BB=9F=E8=8E=B7=E5=8F=96=E5=BE=AE=E5=8D=9A?= =?UTF-8?q?=E5=8E=9F=E5=A7=8B=E5=9B=BE=E7=89=87url=E5=87=BA=E9=94=99?= =?UTF-8?q?=E7=9A=84bug?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- weiboSpider.py | 79 ++++++++++++++++++++++++++++---------------------- 1 file changed, 44 insertions(+), 35 deletions(-) diff --git a/weiboSpider.py b/weiboSpider.py index 4137113e..a236c910 100644 --- a/weiboSpider.py +++ b/weiboSpider.py @@ -295,45 +295,54 
@@ def get_weibo_footer(self, info): def extract_picture_urls(self, info, weibo_id): """提取微博原始图片url""" - a_list = info.xpath("div/a/@href") - first_pic = "https://weibo.cn/mblog/pic/" + weibo_id + "?rl=0" - all_pic = "https://weibo.cn/mblog/picAll/" + weibo_id + "?rl=1" - if first_pic in a_list: - if all_pic in a_list: - selector = self.deal_html(all_pic) - preview_picture_list = selector.xpath("//img/@src") - picture_list = [ - p.replace("/thumb180/", "/large/") - for p in preview_picture_list - ] - picture = ",".join(picture_list) + try: + a_list = info.xpath("div/a/@href") + first_pic = "https://weibo.cn/mblog/pic/" + weibo_id + "?rl=0" + all_pic = "https://weibo.cn/mblog/picAll/" + weibo_id + "?rl=1" + if first_pic in a_list: + if all_pic in a_list: + selector = self.deal_html(all_pic) + preview_picture_list = selector.xpath("//img/@src") + picture_list = [ + p.replace("/thumb180/", "/large/") + for p in preview_picture_list + ] + picture_urls = ",".join(picture_list) + else: + preview_picture = info.xpath(".//img/@src")[-1] + picture_urls = preview_picture.replace( + "/wap180/", "/large/") else: - preview_picture = info.xpath("div/a/img/@src")[0] - picture = preview_picture.replace("/wap180/", "/large/") - else: - picture = "无" - return picture + picture_urls = "无" + return picture_urls + except Exception as e: + print("Error: ", e) + traceback.print_exc() def get_picture_urls(self, info, is_original): """获取微博原始图片url""" - weibo_id = info.xpath("@id")[0][2:] - if is_original: - original_picture = self.extract_picture_urls(info, weibo_id) - self.weibo_pictures.append(original_picture) - if not self.filter: - self.retweet_pictures.append("无") - else: - retweet_url = info.xpath("div/a[@class='cc']/@href")[0] - retweet_id = retweet_url.split("/")[-1].split("?")[0] - retweet_pictures = self.extract_picture_urls(info, retweet_id) - self.retweet_pictures.append(retweet_pictures) - a_list = info.xpath("div[last()]/a/@href") - original_picture = "无" - for a in a_list: - 
if a.endswith(".jpg"): - original_picture = a - break - self.weibo_pictures.append(original_picture) + try: + weibo_id = info.xpath("@id")[0][2:] + if is_original: + original_pictures = self.extract_picture_urls(info, weibo_id) + self.weibo_pictures.append(original_pictures) + if not self.filter: + self.retweet_pictures.append("无") + else: + retweet_url = info.xpath("div/a[@class='cc']/@href")[0] + retweet_id = retweet_url.split("/")[-1].split("?")[0] + retweet_pictures = self.extract_picture_urls(info, retweet_id) + self.retweet_pictures.append(retweet_pictures) + a_list = info.xpath("div[last()]/a/@href") + original_picture = "无" + for a in a_list: + if a.endswith(".jpg"): + original_picture = a + break + self.weibo_pictures.append(original_picture) + except Exception as e: + print("Error: ", e) + traceback.print_exc() def get_one_page(self, page): """获取第page页的全部微博""" From 92cd323000c941a9d909a8f997c3c20e079d0374 Mon Sep 17 00:00:00 2001 From: chenlei Date: Sun, 9 Jun 2019 17:19:25 +0800 Subject: [PATCH 034/363] =?UTF-8?q?fix:=20=E4=BF=AE=E5=A4=8D=E9=83=A8?= =?UTF-8?q?=E5=88=86=E7=94=A8=E6=88=B7cookie=E8=BF=87=E6=9C=9F=E6=97=B6?= =?UTF-8?q?=E6=97=A0=E6=B3=95=E6=AD=A3=E7=A1=AE=E6=8F=90=E7=A4=BA=E7=9A=84?= =?UTF-8?q?=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- weiboSpider.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/weiboSpider.py b/weiboSpider.py index a236c910..613e5ca0 100644 --- a/weiboSpider.py +++ b/weiboSpider.py @@ -67,7 +67,7 @@ def get_nickname(self): selector = self.deal_html(url) nickname = selector.xpath("//title/text()")[0] self.nickname = nickname[:-3] - if self.nickname == u"登录 - 新": + if self.nickname == u"登录 - 新" or self.nickname == u"新浪": sys.exit(u"cookie错误或已过期,请按照README中方法重新获取") print(u"用户昵称: " + self.nickname) except Exception as e: From bc44cb9857b661a33d5c4ac840247383e9e1b1aa Mon Sep 17 00:00:00 2001 From: chenlei Date: Sun, 9 Jun 2019 19:18:05 
+0800 Subject: [PATCH 035/363] =?UTF-8?q?feat:=20=E5=A2=9E=E5=8A=A0=E4=B8=8B?= =?UTF-8?q?=E8=BD=BD=E5=BE=AE=E5=8D=9A=E5=8E=9F=E5=A7=8B=E5=9B=BE=E7=89=87?= =?UTF-8?q?=E5=8A=9F=E8=83=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 包括: - 原创微博原始图片 - 转发微博转发理由中的原始图片 图片命名格式为: user_id+"_"+yyyymmdd+"_"+weibo_id + 后缀(.jpg或.gif等),如果一条微博有多张图片,还要在weibo_id后加上此图片在所有图片中的次序 Issue #12 --- weiboSpider.py | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/weiboSpider.py b/weiboSpider.py index 613e5ca0..4b32b856 100644 --- a/weiboSpider.py +++ b/weiboSpider.py @@ -344,6 +344,49 @@ def get_picture_urls(self, info, is_original): print("Error: ", e) traceback.print_exc() + def download_pic(self, url, pic_name): + """下载单张图片""" + try: + p = requests.get(url) + with open("img/" + pic_name, "wb") as f: + f.write(p.content) + except Exception as e: + with open("img/not_downloaded_pictures.txt", "ab") as f: + url = url + "\n" + f.write(url.encode(sys.stdout.encoding)) + print("Error: ", e) + traceback.print_exc() + + def download_pictures(self): + """下载微博图片""" + try: + print(u"即将进行图片下载") + file_dir = os.path.split( + os.path.realpath(__file__))[0] + os.sep + "img" + if not os.path.isdir(file_dir): + os.mkdir(file_dir) + for i, urls in enumerate(tqdm(self.weibo_pictures, + desc=u"图片下载进度")): + if urls != "无": + pic_prefix = str(self.user_id) + "_" + self.publish_time[ + i][:][:11].replace("-", "") + "_" + self.weibo_id[i] + if "," in urls: + urls = urls.split(",") + if isinstance(urls, list): + for j, url in enumerate(urls): + pic_suffix = url[url.rfind("."):] + pic_name = pic_prefix + "_" + str(j) + pic_suffix + self.download_pic(url, pic_name) + else: + pic_suffix = urls[urls.rfind("."):] + pic_name = pic_prefix + pic_suffix + self.download_pic(urls, pic_name) + print(u"图片下载完毕,保存路径:") + print(file_dir) + except Exception as e: + print("Error: ", e) + traceback.print_exc() + def get_one_page(self, page): 
"""获取第page页的全部微博""" try: @@ -537,6 +580,7 @@ def start(self): self.get_weibo_info() print(u"信息抓取完毕") print("*" * 100) + self.download_pictures() except Exception as e: print("Error: ", e) traceback.print_exc() From aea448ee3bf4310519c64c6a42d4969b2d39e25b Mon Sep 17 00:00:00 2001 From: chenlei Date: Sun, 9 Jun 2019 20:33:03 +0800 Subject: [PATCH 036/363] =?UTF-8?q?fix:=20=E4=BF=AE=E5=A4=8D=E6=97=A0?= =?UTF-8?q?=E6=B3=95=E8=8E=B7=E5=8F=96=E8=BD=AC=E5=8F=91=E7=90=86=E7=94=B1?= =?UTF-8?q?=E4=B8=AD=E9=83=A8=E5=88=86=E6=A0=BC=E5=BC=8F=E7=9A=84=E5=9B=BE?= =?UTF-8?q?=E7=89=87url?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- weiboSpider.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/weiboSpider.py b/weiboSpider.py index 4b32b856..6588c737 100644 --- a/weiboSpider.py +++ b/weiboSpider.py @@ -336,7 +336,7 @@ def get_picture_urls(self, info, is_original): a_list = info.xpath("div[last()]/a/@href") original_picture = "无" for a in a_list: - if a.endswith(".jpg"): + if a.endswith((".gif", ".jpeg", ".jpg", ".png")): original_picture = a break self.weibo_pictures.append(original_picture) From 2a357ee2310b6754345ca52cb8c697391902338f Mon Sep 17 00:00:00 2001 From: chenlei Date: Tue, 11 Jun 2019 20:30:40 +0800 Subject: [PATCH 037/363] =?UTF-8?q?perf:=20=E4=BC=98=E5=8C=96=E7=BB=93?= =?UTF-8?q?=E6=9E=9C=E6=96=87=E4=BB=B6=E3=80=81=E5=9B=BE=E7=89=87=E5=AD=98?= =?UTF-8?q?=E5=82=A8=E7=BB=93=E6=9E=84?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 为每个微博单独创建一个文件夹,名字为用户昵称。文件夹里包含以微博id为名的csv文件和txt文件,还有一个img文件夹来存储图片。 图片名的形式为yyyymmdd+"_"+微博id+"后缀(.jpg或.gif等)"。若某条微博中有多张图片,名字中还会有图片序号。若图片下载失败会在 img文件夹中生成"not_downloaded_pictures.txt",内容为下载失败的原始图片的url Issue #12 --- weiboSpider.py | 39 ++++++++++++++++++++++----------------- 1 file changed, 22 insertions(+), 17 deletions(-) diff --git a/weiboSpider.py b/weiboSpider.py index 6588c737..f9faccb7 100644 --- a/weiboSpider.py +++ 
b/weiboSpider.py @@ -344,14 +344,16 @@ def get_picture_urls(self, info, is_original): print("Error: ", e) traceback.print_exc() - def download_pic(self, url, pic_name): + def download_pic(self, url, pic_path): """下载单张图片""" try: p = requests.get(url) - with open("img/" + pic_name, "wb") as f: + with open(pic_path, "wb") as f: f.write(p.content) except Exception as e: - with open("img/not_downloaded_pictures.txt", "ab") as f: + error_file = self.get_filepath( + "img") + os.sep + "not_downloaded_pictures.txt" + with open(error_file, "ab") as f: url = url + "\n" f.write(url.encode(sys.stdout.encoding)) print("Error: ", e) @@ -361,28 +363,27 @@ def download_pictures(self): """下载微博图片""" try: print(u"即将进行图片下载") - file_dir = os.path.split( - os.path.realpath(__file__))[0] + os.sep + "img" - if not os.path.isdir(file_dir): - os.mkdir(file_dir) + img_dir = self.get_filepath("img") for i, urls in enumerate(tqdm(self.weibo_pictures, desc=u"图片下载进度")): if urls != "无": - pic_prefix = str(self.user_id) + "_" + self.publish_time[ - i][:][:11].replace("-", "") + "_" + self.weibo_id[i] + pic_prefix = self.publish_time[i][:][:11].replace( + "-", "") + "_" + self.weibo_id[i] if "," in urls: urls = urls.split(",") - if isinstance(urls, list): for j, url in enumerate(urls): pic_suffix = url[url.rfind("."):] - pic_name = pic_prefix + "_" + str(j) + pic_suffix - self.download_pic(url, pic_name) + pic_name = pic_prefix + "_" + str(j + + 1) + pic_suffix + pic_path = img_dir + os.sep + pic_name + self.download_pic(url, pic_path) else: pic_suffix = urls[urls.rfind("."):] pic_name = pic_prefix + pic_suffix - self.download_pic(urls, pic_name) + pic_path = img_dir + os.sep + pic_name + self.download_pic(urls, pic_path) print(u"图片下载完毕,保存路径:") - print(file_dir) + print(img_dir) except Exception as e: print("Error: ", e) traceback.print_exc() @@ -414,10 +415,14 @@ def get_one_page(self, page): def get_filepath(self, type): """获取结果文件路径""" try: - file_dir = os.path.split( - 
os.path.realpath(__file__))[0] + os.sep + "weibo" + file_dir = os.path.split(os.path.realpath( + __file__))[0] + os.sep + "weibo" + os.sep + self.nickname + if type == "img": + file_dir = file_dir + os.sep + "img" if not os.path.isdir(file_dir): - os.mkdir(file_dir) + os.makedirs(file_dir) + if type == "img": + return file_dir file_path = file_dir + os.sep + "%d" % self.user_id + "." + type return file_path except Exception as e: From a883d61b077059937ee5601bd885bb27a1f57f69 Mon Sep 17 00:00:00 2001 From: chenlei Date: Tue, 11 Jun 2019 21:30:08 +0800 Subject: [PATCH 038/363] =?UTF-8?q?feat:=20=E6=B7=BB=E5=8A=A0pic=5Fdownloa?= =?UTF-8?q?d=E5=8F=82=E6=95=B0=E6=8E=A7=E5=88=B6=E6=98=AF=E5=90=A6?= =?UTF-8?q?=E4=B8=8B=E8=BD=BD=E5=BE=AE=E5=8D=9A=E5=8E=9F=E5=A7=8B=E5=9B=BE?= =?UTF-8?q?=E7=89=87?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 取值为0、1,0代表不下载微博原始图片,1代表下载微博原始图片 Issue #12 --- weiboSpider.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/weiboSpider.py b/weiboSpider.py index f9faccb7..f49c3a67 100644 --- a/weiboSpider.py +++ b/weiboSpider.py @@ -19,10 +19,17 @@ class Weibo: cookie = {"Cookie": "your cookie"} # 将your cookie替换成自己的cookie - def __init__(self, user_id, filter=0): + def __init__(self, user_id, filter=0, pic_download=0): """Weibo类初始化""" + if not isinstance(user_id, int): + sys.exit(u"user_id值应为一串数字形式,请重新输入") + if filter != 0 and filter != 1: + sys.exit(u"filter值应为0或1,请重新输入") + if pic_download != 0 and pic_download != 1: + sys.exit(u"pic_download值应为0或1,请重新输入") self.user_id = user_id # 用户id,即需要我们输入的数字,如昵称为"Dear-迪丽热巴"的id为1669879400 self.filter = filter # 取值范围为0、1,程序默认值为0,代表要爬取用户的全部微博,1代表只爬取用户的原创微博 + self.pic_download = pic_download # 取值范围为0、1,程序默认值为0,代表不下载微博原始图片,1代表下载 self.nickname = "" # 用户昵称,如“Dear-迪丽热巴” self.weibo_num = 0 # 用户全部微博数 self.got_num = 0 # 爬取到的微博数 @@ -585,7 +592,8 @@ def start(self): self.get_weibo_info() print(u"信息抓取完毕") print("*" * 100) - 
self.download_pictures() + if self.pic_download == 1: + self.download_pictures() except Exception as e: print("Error: ", e) traceback.print_exc() @@ -596,7 +604,8 @@ def main(): # 使用实例,输入一个用户id,所有信息都会存储在wb实例中 user_id = 1669879400 # 可以改成任意合法的用户id(爬虫的微博id除外) filter = 1 # 值为0表示爬取全部微博(原创微博+转发微博),值为1表示只爬取原创微博 - wb = Weibo(user_id, filter) # 调用Weibo类,创建微博实例wb + pic_download = 1 # 值为0代表不下载微博原始图片,1代表下载微博原始图片 + wb = Weibo(user_id, filter, pic_download) # 调用Weibo类,创建微博实例wb wb.start() # 爬取微博信息 print(u"用户昵称: " + wb.nickname) print(u"全部微博数: " + str(wb.weibo_num)) From 8867606ab5b5d232ba806ff52925c19561890262 Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Wed, 12 Jun 2019 22:58:42 +0800 Subject: [PATCH 039/363] Update README.md --- README.md | 48 ++++++++++++++++++++++++++++-------------------- 1 file changed, 28 insertions(+), 20 deletions(-) diff --git a/README.md b/README.md index 7eddf3e1..8f2698b8 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,15 @@ # 功能 -爬取新浪微博信息,并写入csv/txt文件,文件名为目标用户id加".csv"和".txt"的形式。
+爬取新浪微博信息,并写入csv/txt文件,文件名为目标用户id加".csv"和".txt"的形式,同时还会下载该微博原始图片(可选)。

-csv文件结果如下所示: -![](https://picture.cognize.me/cognize/github/weibospider/weibo_csv.png)*1669879400.csv* - +以爬取迪丽热巴的微博为例,她的微博昵称为"Dear-迪丽热巴",id为1669879400(后面会讲如何获取用户id)。我们选择爬取她的原创微博。程序会自动生成一个weibo文件夹,我们以后爬取的所有微博都被存储在这里。然后程序在该文件夹下生成一个名为"Dear-迪丽热巴"的文件夹,迪丽热巴的所有微博爬取结果都在这里。"Dear-迪丽热巴"文件夹里包含一个csv文件、一个txt文件和一个img文件夹,img文件夹用来存储下载到的图片。
+
+csv文件结果如下所示: +![](https://picture.cognize.me/cognize/github/weibospider/weibo_csv.png)*1669879400.csv*
txt文件结果如下所示: -![](https://picture.cognize.me/cognize/github/weibospider/weibo_txt.png)*1669879400.txt* - +![](https://picture.cognize.me/cognize/github/weibospider/weibo_txt.png)*1669879400.txt*
+下载的图片如下所示: +![](https://picture.cognize.me/cognize/github/weibospider/picture.png)*img文件夹*
+本次下载了766张图片,大小一共1.15GB,包括她原创微博中的图片和转发微博转发理由中的图片。图片名为yyyymmdd+微博id的形式,若某条微博存在多张图片,则图片名中还包括它在微博图片中的序号。本次下载有一张图片因为超时没有下载下来,该图片url被写到了not_downloaded_pictures.txt。 # 输入 用户id,例如新浪微博昵称为“Dear-迪丽热巴”的id为“1669879400” @@ -15,18 +18,20 @@ txt文件结果如下所示: - 微博数:用户的全部微博数(转发微博+原创微博) - 关注数:用户关注的微博账号数量 - 粉丝数:用户的粉丝数 +- 微博id:以list的形式存储了用户所有微博内容 - 微博内容:以list的形式存储了用户所有微博内容 +- 原始图片url:以list的形式存储了用户所有微博内容,若某条微博存在多张图片,每个url以英文逗号分隔,若没有图片则值为无 - 微博位置:以list的形式存储了用户所有微博的发布位置 - 微博发布时间:以list的形式存储了用户所有微博的发布时间 - 微博对应的点赞数:以list的形式存储了用户所有微博对应的点赞数 - 微博对应的转发数:以list的形式存储了用户所有微博对应的转发数 - 微博对应的评论数:以list的形式存储了用户所有微博对应的评论数 -- 微博发布工具:以list的形式存储了用户所有微博的发布工具,如iPhone客户端、HUAWEI Mate 20 Pro等 +- 微博发布工具:以list的形式存储了用户所有微博的发布工具,如iPhone客户端、HUAWEI Mate 20 Pro等 - 结果文件:保存在当前目录的weibo文件夹里,名字为"user_id.csv"和"user_id.txt"的形式 # 运行环境 - 开发语言:python2/python3 -- 系统: Windows/Linux +- 系统: Windows/Linux/macOS # 使用说明 ## 1.下载脚本 @@ -35,9 +40,9 @@ $ git clone https://github.com/dataabc/weibospider.git ``` 运行上述命令,将本项目下载到当前目录,如果下载成功当前目录会出现一个名为"weibospider"的文件夹; ## 2.设置cookie和user_id -打开weibospider文件夹下的"**weibospider.py**"文件,将“**your cookie**”替换成爬虫微博的cookie,后面会详细讲解如何获取cookie;将**user_id**替换成想要爬取的微博的user_id,后面会详细讲解如何获取user_id; +打开weibospider文件夹下的"**weibospider.py**"文件,将“**your cookie**”替换成爬虫微博的cookie,后面会详细讲解如何获取cookie;将**user_id**替换成想要爬取的微博的user_id,后面会详细讲解如何获取user_id; ## 3.运行脚本 -大家可以根据自己的运行环境选择运行方式,Linux可以通过 +大家可以根据自己的运行环境选择运行方式,Linux可以通过 ```bash $ python weibospider.py ``` @@ -48,30 +53,33 @@ $ python weibospider.py ```python user_id = 1669879400 filter = 1 -wb = Weibo(user_id,filter) #调用Weibo类,创建微博实例wb +pic_download = 1 +wb = Weibo(user_id, filter, pic_download) #调用Weibo类,创建微博实例wb wb.start() #爬取微博信息 ``` -user_id可以改成任意合法的用户id(爬虫的微博id除外);filter默认值为0,表示爬取所有微博信息(转发微博+原创微博),为1表示只爬取用户的所有原创微博;wb是Weibo类的一个实例,也可以是其它名字,只要符合python的命名规范即可;通过执行wb.start() 完成了微博的爬取工作。在上述代码执行后,我们可以得到很多信息:
+user_id可以改成任意合法的用户id(爬虫的微博id除外);filter默认值为0,表示爬取所有微博信息(转发微博+原创微博),为1表示只爬取用户的所有原创微博;pic_download默认值为0,代表不下载微博原始图片,1代表下载;wb是Weibo类的一个实例,也可以是其它名字,只要符合python的命名规范即可;通过执行wb.start() 完成了微博的爬取工作。在上述代码执行后,我们可以得到很多信息:
**wb.nickname**:用户昵称;
**wb.weibo_num**:微博数;
**wb.following**:关注数;
**wb.followers**:粉丝数;
**wb.weibo_content**:存储用户的所有微博,为list形式,若filter=1, wb.weibo_content[0]为最新一条**原创**微博,filter=0为最新一条微博,wb.weibo_content[1]、wb.weibo_content[2]分别表示第二新和第三新的微博,以此类推。当然如果用户没有发过微博,则wb.weibo_content为[];
-**wb.weibo_place**: 存储微博的发布位置,为list形式,如wb.weibo_place[0]为最新一条微博的发布位置,与wb.weibo_content[0]对应,如果该条微博没有位置信息,则weibo_place内容为无,其它用法同wb.weibo_content;
-**wb.publish_time**: 存储微博的发布时间,为list形式,如wb.publish_time[0]为最新一条微博的发布时间,与wb.weibo_content[0]对应,其它用法同wb.weibo_content;
-**wb.up_num**:存储微博获得的点赞数,为list形式,如wb.up_num[0]为最新一条微博获得的点赞数,与wb.weibo_content[0]对应,其它用法同wb.weibo_content;
-**wb.retweet_num**:存储微博获得的转发数,为list形式,如wb.retweet_num[0]为最新一条微博获得的转发数,与wb.weibo_content[0]对应,其它用法同wb.weibo_content;
-**wb.comment_num**:存储微博获得的评论数,为list形式,如wb.comment_num[0]为最新一条微博获得的评论数,与wb.weibo_content[0]对应,其它用法同wb.weibo_content;
-**wb.publish_tool**:存储微博的发布工具,为list形式,如wb.publish_tool[0]为最新一条微博的发布工具,与wb.weibo_content[0]对应,其它用法同wb.weibo_content。 +**weibo_pictures**:存储原创微博的原始图片url和转发微博转发理由中的图片url,为list形式。如wb.weibo_pictures[0]为最新一条微博的图片url,与wb.weibo_content[0]对应。若该条微博有多张图片,则wb.weibo_pictures[0]存储多个url,以英文逗号分割;若该微博没有图片,则值为"无",其它用法同wb.weibo_content;
+**retweet_pictures**:存储被转发微博中的原始图片url,为list形式。当最新微博为原创微博或者为没有图片的转发微博时,则wb.retweet_pictures[0]值为"无",否则为最新一条被转发微博的图片url,与wb.weibo_content[0]对应。若有多张图片,则wb.retweet_pictures[0]存储多个url,以英文逗号分割,其它用法同wb.weibo_content;
+**wb.weibo_place**: 存储微博的发布位置,为list形式。如wb.weibo_place[0]为最新一条微博的发布位置,与wb.weibo_content[0]对应。如果该条微博没有位置信息,则weibo_place值为"无",其它用法同wb.weibo_content;
+**wb.publish_time**: 存储微博的发布时间,为list形式。如wb.publish_time[0]为最新一条微博的发布时间,与wb.weibo_content[0]对应。其它用法同wb.weibo_content;
+**wb.up_num**:存储微博获得的点赞数,为list形式。如wb.up_num[0]为最新一条微博获得的点赞数,与wb.weibo_content[0]对应。其它用法同wb.weibo_content;
+**wb.retweet_num**:存储微博获得的转发数,为list形式。如wb.retweet_num[0]为最新一条微博获得的转发数,与wb.weibo_content[0]对应。其它用法同wb.weibo_content;
+**wb.comment_num**:存储微博获得的评论数,为list形式。如wb.comment_num[0]为最新一条微博获得的评论数,与wb.weibo_content[0]对应。其它用法同wb.weibo_content;
+**wb.publish_tool**:存储微博的发布工具,为list形式。如wb.publish_tool[0]为最新一条微博的发布工具,与wb.weibo_content[0]对应。其它用法同wb.weibo_content。 # 如何获取cookie 1.用Chrome打开
2.输入微博的用户名、密码,登录,如图所示: ![](https://picture.cognize.me/cognize/github/weibospider/cookie1.png) 登录成功后会跳转到;
-3.按F12键打开Chrome开发者工具,在地址栏输入并跳转到,跳转后会显示如下类似界面: +3.按F12键打开Chrome开发者工具,在地址栏输入并跳转到,跳转后会显示如下类似界面: ![](https://picture.cognize.me/cognize/github/weibospider/cookie2.png) -4.点击Chrome开发者工具“Name"列表中的"weibo.cn",点击"Headers",其中"Request Headers"下,"Cookie"后的值即为我们要找的cookie值,复制即可,如图所示: +4.依此点击Chrome开发者工具中的Network->Name中的weibo.cn->Headers->Request Headers,"Cookie:"后的值即为我们要找的cookie值,复制即可,如图所示: ![](https://picture.cognize.me/cognize/github/weibospider/cookie3.png) # 如何获取user_id From c414926d8fe4242961413e2e91df12db47f144e9 Mon Sep 17 00:00:00 2001 From: chenlei Date: Fri, 14 Jun 2019 22:56:07 +0800 Subject: [PATCH 040/363] =?UTF-8?q?perf:=20=E4=BC=98=E5=8C=96=E5=BE=AE?= =?UTF-8?q?=E5=8D=9A=E5=AD=98=E5=82=A8=E7=BB=93=E6=9E=84?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 将微博id、正文、图片url、发布位置、发布时间、发布工具、点赞数、转发数、评论数等存储在self.weibo中 --- weiboSpider.py | 227 +++++++++++++++++++++++-------------------------- 1 file changed, 105 insertions(+), 122 deletions(-) diff --git a/weiboSpider.py b/weiboSpider.py index f49c3a67..4a798dd1 100644 --- a/weiboSpider.py +++ b/weiboSpider.py @@ -8,6 +8,7 @@ import re import sys import traceback +from collections import OrderedDict from datetime import datetime, timedelta from time import sleep @@ -35,17 +36,7 @@ def __init__(self, user_id, filter=0, pic_download=0): self.got_num = 0 # 爬取到的微博数 self.following = 0 # 用户关注数 self.followers = 0 # 用户粉丝数 - self.weibo_id = [] # 微博id - self.weibo_content = [] # 微博内容 - self.weibo_pictures = [] # 微博原始图片的url,包括原创微博的原始图片url和转发微博"转发理由"中图片的url - self.retweet_pictures = [] # 被转发微博中原始图片的url - self.original = [] # 是否为原创微博 - self.weibo_place = [] # 微博位置 - self.publish_time = [] # 微博发布时间 - self.up_num = [] # 微博对应的点赞数 - self.retweet_num = [] # 微博对应的转发数 - self.comment_num = [] # 微博对应的评论数 - self.publish_tool = [] # 微博发布工具 + self.weibo = [] # 存储爬取到的所有微博信息 def deal_html(self, url): """处理html""" @@ -120,9 +111,9 @@ def get_long_weibo(self, weibo_link): info = 
selector.xpath("//div[@class='c']")[1] wb_content = self.deal_garbled(info) wb_time = info.xpath("//span[@class='ct']/text()")[0] - wb_content = wb_content[wb_content.find(":") + - 1:wb_content.rfind(wb_time)] - return wb_content + weibo_content = wb_content[wb_content.find(":") + + 1:wb_content.rfind(wb_time)] + return weibo_content except Exception as e: print("Error: ", e) traceback.print_exc() @@ -147,8 +138,8 @@ def get_long_retweet(self, weibo_link): """获取长转发微博""" try: wb_content = self.get_long_weibo(weibo_link) - wb_content = wb_content[:wb_content.rfind(u"原文转发")] - return wb_content + weibo_content = wb_content[:wb_content.rfind(u"原文转发")] + return weibo_content except Exception as e: print("Error: ", e) traceback.print_exc() @@ -193,39 +184,38 @@ def get_weibo_content(self, info, is_original): """获取微博内容""" try: weibo_id = info.xpath("@id")[0][2:] - self.weibo_id.append(weibo_id) if is_original: weibo_content = self.get_original_weibo(info, weibo_id) else: weibo_content = self.get_retweet(info, weibo_id) - self.weibo_content.append(weibo_content) print(weibo_content) + return weibo_content except Exception as e: print("Error: ", e) traceback.print_exc() - def get_weibo_place(self, info): + def get_publish_place(self, info): """获取微博发布位置""" try: div_first = info.xpath("div")[0] a_list = div_first.xpath("a") - weibo_place = u"无" + publish_place = u"无" for a in a_list: if ("place.weibo.com" in a.xpath("@href")[0] and a.xpath("text()")[0] == u"显示地图"): weibo_a = div_first.xpath("span[@class='ctt']/a") if len(weibo_a) >= 1: - weibo_place = weibo_a[-1] + publish_place = weibo_a[-1] if (u"视频" == div_first.xpath( "span[@class='ctt']/a/text()")[-1][-2:]): if len(weibo_a) >= 2: - weibo_place = weibo_a[-2] + publish_place = weibo_a[-2] else: - weibo_place = u"无" - weibo_place = self.deal_garbled(weibo_place) + publish_place = u"无" + publish_place = self.deal_garbled(publish_place) break - self.weibo_place.append(weibo_place) - print(u"微博位置: " + weibo_place) + 
print(u"微博发布位置: " + publish_place) + return publish_place except Exception as e: print("Error: ", e) traceback.print_exc() @@ -255,8 +245,8 @@ def get_publish_time(self, info): publish_time = year + "-" + month + "-" + day + " " + time else: publish_time = publish_time[:16] - self.publish_time.append(publish_time) print(u"微博发布时间: " + publish_time) + return publish_time except Exception as e: print("Error: ", e) traceback.print_exc() @@ -270,8 +260,8 @@ def get_publish_tool(self, info): publish_tool = str_time.split(u"来自")[1] else: publish_tool = u"无" - self.publish_tool.append(publish_tool) print(u"微博发布工具: " + publish_tool) + return publish_tool except Exception as e: print("Error: ", e) traceback.print_exc() @@ -279,6 +269,7 @@ def get_publish_tool(self, info): def get_weibo_footer(self, info): """获取微博点赞数、转发数、评论数""" try: + footer = {} pattern = r"\d+" str_footer = info.xpath("div")[-1] str_footer = self.deal_garbled(str_footer) @@ -286,16 +277,17 @@ def get_weibo_footer(self, info): weibo_footer = re.findall(pattern, str_footer, re.M) up_num = int(weibo_footer[0]) - self.up_num.append(up_num) print(u"点赞数: " + str(up_num)) + footer["up_num"] = up_num retweet_num = int(weibo_footer[1]) - self.retweet_num.append(retweet_num) print(u"转发数: " + str(retweet_num)) + footer["retweet_num"] = retweet_num comment_num = int(weibo_footer[2]) - self.comment_num.append(comment_num) print(u"评论数: " + str(comment_num)) + footer["comment_num"] = comment_num + return footer except Exception as e: print("Error: ", e) traceback.print_exc() @@ -330,23 +322,25 @@ def get_picture_urls(self, info, is_original): """获取微博原始图片url""" try: weibo_id = info.xpath("@id")[0][2:] + picture_urls = {} if is_original: original_pictures = self.extract_picture_urls(info, weibo_id) - self.weibo_pictures.append(original_pictures) + picture_urls["original_pictures"] = original_pictures if not self.filter: - self.retweet_pictures.append("无") + picture_urls["retweet_pictures"] = "无" else: retweet_url = 
info.xpath("div/a[@class='cc']/@href")[0] retweet_id = retweet_url.split("/")[-1].split("?")[0] retweet_pictures = self.extract_picture_urls(info, retweet_id) - self.retweet_pictures.append(retweet_pictures) + picture_urls["retweet_pictures"] = retweet_pictures a_list = info.xpath("div[last()]/a/@href") original_picture = "无" for a in a_list: if a.endswith((".gif", ".jpeg", ".jpg", ".png")): original_picture = a break - self.weibo_pictures.append(original_picture) + picture_urls["original_pictures"] = original_picture + return picture_urls except Exception as e: print("Error: ", e) traceback.print_exc() @@ -371,48 +365,73 @@ def download_pictures(self): try: print(u"即将进行图片下载") img_dir = self.get_filepath("img") - for i, urls in enumerate(tqdm(self.weibo_pictures, - desc=u"图片下载进度")): - if urls != "无": - pic_prefix = self.publish_time[i][:][:11].replace( - "-", "") + "_" + self.weibo_id[i] - if "," in urls: - urls = urls.split(",") - for j, url in enumerate(urls): + for w in tqdm(self.weibo, desc=u"图片下载进度"): + if w["original_pictures"] != "无": + pic_prefix = w["publish_time"][:11].replace( + "-", "") + "_" + w["id"] + if "," in w["original_pictures"]: + w["original_pictures"] = w["original_pictures"].split( + ",") + for j, url in enumerate(w["original_pictures"]): pic_suffix = url[url.rfind("."):] pic_name = pic_prefix + "_" + str(j + 1) + pic_suffix pic_path = img_dir + os.sep + pic_name self.download_pic(url, pic_path) else: - pic_suffix = urls[urls.rfind("."):] + pic_suffix = w["original_pictures"][ + w["original_pictures"].rfind("."):] pic_name = pic_prefix + pic_suffix pic_path = img_dir + os.sep + pic_name - self.download_pic(urls, pic_path) + self.download_pic(w["original_pictures"], pic_path) print(u"图片下载完毕,保存路径:") print(img_dir) except Exception as e: print("Error: ", e) traceback.print_exc() + def get_one_weibo(self, info): + """获取一条微博的全部信息""" + try: + weibo = OrderedDict() + is_original = self.is_original(info) + if (not self.filter) or is_original: + 
weibo["id"] = info.xpath("@id")[0][2:] + weibo["content"] = self.get_weibo_content(info, + is_original) # 微博内容 + picture_urls = self.get_picture_urls(info, is_original) + weibo["original_pictures"] = picture_urls[ + "original_pictures"] # 原创图片url + if not self.filter: + weibo["retweet_pictures"] = picture_urls[ + "retweet_pictures"] # 转发图片url + weibo["original"] = is_original # 是否原创微博 + weibo["publish_place"] = self.get_publish_place(info) # 微博发布位置 + weibo["publish_time"] = self.get_publish_time(info) # 微博发布时间 + weibo["publish_tool"] = self.get_publish_tool(info) # 微博发布工具 + footer = self.get_weibo_footer(info) + weibo["up_num"] = footer["up_num"] # 微博点赞数 + weibo["retweet_num"] = footer["retweet_num"] # 转发数 + weibo["comment_num"] = footer["comment_num"] # 评论数 + else: + weibo = None + return weibo + except Exception as e: + print("Error: ", e) + traceback.print_exc() + def get_one_page(self, page): """获取第page页的全部微博""" try: url = "https://weibo.cn/u/%d?page=%d" % (self.user_id, page) selector = self.deal_html(url) info = selector.xpath("//div[@class='c']") - is_empty = info[0].xpath("div/span[@class='ctt']") - if is_empty: + is_exist = info[0].xpath("div/span[@class='ctt']") + if is_exist: for i in range(0, len(info) - 2): - is_original = self.is_original(info[i]) - self.original.append(is_original) - if (not self.filter) or is_original: - self.get_weibo_content(info[i], is_original) # 微博内容 - self.get_picture_urls(info[i], is_original) # 微博图片url - self.get_weibo_place(info[i]) # 微博位置 - self.get_publish_time(info[i]) # 微博发布时间 - self.get_publish_tool(info[i]) # 微博发布工具 - self.get_weibo_footer(info[i]) # 微博点赞数、转发数、评论数 + weibo = self.get_one_weibo(info[i]) + if weibo: + self.weibo.append(weibo) self.got_num += 1 print("-" * 100) except Exception as e: @@ -439,56 +458,21 @@ def get_filepath(self, type): def write_csv(self, wrote_num): """将爬取的信息写入csv文件""" try: - if self.filter: - result_headers = [ - "微博id", - "微博正文", - "原始图片url", - "发布位置", - "发布时间", - "发布工具", - "点赞数", - 
"转发数", - "评论数", - ] - result_data = zip( - self.weibo_id[wrote_num:], - self.weibo_content[wrote_num:], - self.weibo_pictures[wrote_num:], - self.weibo_place[wrote_num:], - self.publish_time[wrote_num:], - self.publish_tool[wrote_num:], - self.up_num[wrote_num:], - self.retweet_num[wrote_num:], - self.comment_num[wrote_num:], - ) - else: - result_headers = [ - "微博id", - "微博正文", - "原始图片url", - "被转发微博原始图片url", - "是否为原创微博", - "发布位置", - "发布时间", - "发布工具", - "点赞数", - "转发数", - "评论数", - ] - result_data = zip( - self.weibo_id[wrote_num:], - self.weibo_content[wrote_num:], - self.weibo_pictures[wrote_num:], - self.retweet_pictures[wrote_num:], - self.original[wrote_num:], - self.weibo_place[wrote_num:], - self.publish_time[wrote_num:], - self.publish_tool[wrote_num:], - self.up_num[wrote_num:], - self.retweet_num[wrote_num:], - self.comment_num[wrote_num:], - ) + result_headers = [ + "微博id", + "微博正文", + "原始图片url", + "发布位置", + "发布时间", + "发布工具", + "点赞数", + "转发数", + "评论数", + ] + if not self.filter: + result_headers.insert(3, "被转发微博原始图片url") + result_headers.insert(4, "是否为原创微博") + result_data = [w.values() for w in self.weibo][wrote_num:] if sys.version < "3": # python2.x reload(sys) sys.setdefaultencoding("utf-8") @@ -528,15 +512,14 @@ def write_txt(self, wrote_num): str(self.following) + u"\n粉丝数: " + str(self.followers) + result_header) temp_result.append(result_header) - for i in range(wrote_num, self.got_num): + for i, w in enumerate(self.weibo[wrote_num:]): temp_result.append( - str(i + 1) + ":" + self.weibo_content[i] + "\n" + - u"微博位置: " + self.weibo_place[i] + "\n" + u"发布时间: " + - self.publish_time[i] + "\n" + u"点赞数: " + - str(self.up_num[i]) + u" 转发数: " + - str(self.retweet_num[i]) + u" 评论数: " + - str(self.comment_num[i]) + "\n" + u"发布工具: " + - self.publish_tool[i] + "\n\n") + str(wrote_num + i + 1) + ":" + w["content"] + "\n" + + u"微博位置: " + w["publish_place"] + "\n" + u"发布时间: " + + w["publish_time"] + "\n" + u"点赞数: " + str(w["up_num"]) + + u" 转发数: " + 
str(w["retweet_num"]) + u" 评论数: " + + str(w["comment_num"]) + "\n" + u"发布工具: " + + w["publish_tool"] + "\n\n") result = "".join(temp_result) with open(self.get_filepath("txt"), "ab") as f: f.write(result.encode(sys.stdout.encoding)) @@ -611,14 +594,14 @@ def main(): print(u"全部微博数: " + str(wb.weibo_num)) print(u"关注数: " + str(wb.following)) print(u"粉丝数: " + str(wb.followers)) - if wb.weibo_content: - print(u"最新/置顶 微博为: " + wb.weibo_content[0]) - print(u"最新/置顶 微博位置: " + wb.weibo_place[0]) - print(u"最新/置顶 微博发布时间: " + wb.publish_time[0]) - print(u"最新/置顶 微博获得赞数: " + str(wb.up_num[0])) - print(u"最新/置顶 微博获得转发数: " + str(wb.retweet_num[0])) - print(u"最新/置顶 微博获得评论数: " + str(wb.comment_num[0])) - print(u"最新/置顶 微博发布工具: " + wb.publish_tool[0]) + if wb.weibo: + print(u"最新/置顶 微博为: " + wb.weibo[0]["content"]) + print(u"最新/置顶 微博位置: " + wb.weibo[0]["publish_place"]) + print(u"最新/置顶 微博发布时间: " + wb.weibo[0]["publish_time"]) + print(u"最新/置顶 微博获得赞数: " + str(wb.weibo[0]["up_num"])) + print(u"最新/置顶 微博获得转发数: " + str(wb.weibo[0]["retweet_num"])) + print(u"最新/置顶 微博获得评论数: " + str(wb.weibo[0]["comment_num"])) + print(u"最新/置顶 微博发布工具: " + wb.weibo[0]["publish_tool"]) except Exception as e: print("Error: ", e) traceback.print_exc() From 96b606769debecbcc4250f8e830f186e64b29d35 Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Sat, 15 Jun 2019 01:47:25 +0800 Subject: [PATCH 041/363] Update README.md --- README.md | 48 +++++++++++++++++++++++++++--------------------- 1 file changed, 27 insertions(+), 21 deletions(-) diff --git a/README.md b/README.md index 8f2698b8..9095c642 100644 --- a/README.md +++ b/README.md @@ -9,25 +9,27 @@ txt文件结果如下所示: ![](https://picture.cognize.me/cognize/github/weibospider/weibo_txt.png)*1669879400.txt*
下载的图片如下所示: ![](https://picture.cognize.me/cognize/github/weibospider/picture.png)*img文件夹*
-本次下载了766张图片,大小一共1.15GB,包括她原创微博中的图片和转发微博转发理由中的图片。图片名为yyyymmdd+微博id的形式,若某条微博存在多张图片,则图片名中还包括它在微博图片中的序号。本次下载有一张图片因为超时没有下载下来,该图片url被写到了not_downloaded_pictures.txt。 +本次下载了766张图片,大小一共1.15GB,包括她原创微博中的图片和转发微博转发理由中的图片。图片名为yyyymmdd+微博id的形式,若某条微博存在多张图片,则图片名中还会包括它在微博图片中的序号。本次下载有一张图片因为超时没有下载下来,该图片url被写到了not_downloaded_pictures.txt。 + # 输入 用户id,例如新浪微博昵称为“Dear-迪丽热巴”的id为“1669879400” # 输出 - 昵称:用户昵称,如"Dear-迪丽热巴" - 微博数:用户的全部微博数(转发微博+原创微博) -- 关注数:用户关注的微博账号数量 +- 关注数:用户关注的微博数量 - 粉丝数:用户的粉丝数 -- 微博id:以list的形式存储了用户所有微博内容 -- 微博内容:以list的形式存储了用户所有微博内容 -- 原始图片url:以list的形式存储了用户所有微博内容,若某条微博存在多张图片,每个url以英文逗号分隔,若没有图片则值为无 -- 微博位置:以list的形式存储了用户所有微博的发布位置 -- 微博发布时间:以list的形式存储了用户所有微博的发布时间 -- 微博对应的点赞数:以list的形式存储了用户所有微博对应的点赞数 -- 微博对应的转发数:以list的形式存储了用户所有微博对应的转发数 -- 微博对应的评论数:以list的形式存储了用户所有微博对应的评论数 -- 微博发布工具:以list的形式存储了用户所有微博的发布工具,如iPhone客户端、HUAWEI Mate 20 Pro等 -- 结果文件:保存在当前目录的weibo文件夹里,名字为"user_id.csv"和"user_id.txt"的形式 +- 微博id:微博唯一标志 +- 微博内容:微博正文 +- 原始图片url:原创微博图片和转发微博转发理由中图片的url,若某条微博存在多张图片,每个url以英文逗号分隔,若没有图片则值为无 +- 微博发布位置:位置微博中的发布位置 +- 微博发布时间:微博发布时的时间,精确到分 +- 点赞数:微博被赞的数量 +- 转发数:微博被转发的数量 +- 评论数:微博被评论的数量 +- 微博发布工具:微博的发布工具,如iPhone客户端、HUAWEI Mate 20 Pro等 +- 结果文件:保存在当前目录weibo文件夹下以用户昵称为名的文件夹里,名字为"user_id.csv"和"user_id.txt"的形式 +- 微博图片:原创微博中的图片和转发微博转发理由中的图片,保存在以用户昵称为名的文件夹下的img文件夹里 # 运行环境 - 开发语言:python2/python3 @@ -62,15 +64,19 @@ user_id可以改成任意合法的用户id(爬虫的微博id除外);filter **wb.weibo_num**:微博数;
**wb.following**:关注数;
**wb.followers**:粉丝数;
-**wb.weibo_content**:存储用户的所有微博,为list形式,若filter=1, wb.weibo_content[0]为最新一条**原创**微博,filter=0为最新一条微博,wb.weibo_content[1]、wb.weibo_content[2]分别表示第二新和第三新的微博,以此类推。当然如果用户没有发过微博,则wb.weibo_content为[];
-**weibo_pictures**:存储原创微博的原始图片url和转发微博转发理由中的图片url,为list形式。如wb.weibo_pictures[0]为最新一条微博的图片url,与wb.weibo_content[0]对应。若该条微博有多张图片,则wb.weibo_pictures[0]存储多个url,以英文逗号分割;若该微博没有图片,则值为"无",其它用法同wb.weibo_content;
-**retweet_pictures**:存储被转发微博中的原始图片url,为list形式。当最新微博为原创微博或者为没有图片的转发微博时,则wb.retweet_pictures[0]值为"无",否则为最新一条被转发微博的图片url,与wb.weibo_content[0]对应。若有多张图片,则wb.retweet_pictures[0]存储多个url,以英文逗号分割,其它用法同wb.weibo_content;
-**wb.weibo_place**: 存储微博的发布位置,为list形式。如wb.weibo_place[0]为最新一条微博的发布位置,与wb.weibo_content[0]对应。如果该条微博没有位置信息,则weibo_place值为"无",其它用法同wb.weibo_content;
-**wb.publish_time**: 存储微博的发布时间,为list形式。如wb.publish_time[0]为最新一条微博的发布时间,与wb.weibo_content[0]对应。其它用法同wb.weibo_content;
-**wb.up_num**:存储微博获得的点赞数,为list形式。如wb.up_num[0]为最新一条微博获得的点赞数,与wb.weibo_content[0]对应。其它用法同wb.weibo_content;
-**wb.retweet_num**:存储微博获得的转发数,为list形式。如wb.retweet_num[0]为最新一条微博获得的转发数,与wb.weibo_content[0]对应。其它用法同wb.weibo_content;
-**wb.comment_num**:存储微博获得的评论数,为list形式。如wb.comment_num[0]为最新一条微博获得的评论数,与wb.weibo_content[0]对应。其它用法同wb.weibo_content;
-**wb.publish_tool**:存储微博的发布工具,为list形式。如wb.publish_tool[0]为最新一条微博的发布工具,与wb.weibo_content[0]对应。其它用法同wb.weibo_content。 +**wb.weibo**:除不包含上述信息外,wb.weibo包含爬取到的所有微博信息,如**微博id**、**微博正文**、**原始图片url**、**发布位置**、**发布时间**、**发布工具**、**点赞数**、**转发数**、**评论数**等。如果爬的是全部微博(原创+转发),除上述信息之外,还包含被**转发微博原始图片url**、**是否为原创微博**等。wb.weibo是一个列表,包含了爬取的所有微博信息。wb.weibo[0]为爬取的第一条微博,wb.weibo[1]为爬取的第二条微博,以此类推。当filter=1时,wb.weibo[0]为爬取的第一条**原创**微博,以此类推。wb.weibo[0]["id"]为第一条微博的id,wb.weibo[0]["content"]为第一条微博的正文,wb.weibo[0]["publish_time"]为第一条微博的发布时间,还有其它很多信息不在赘述,大家可以点击下面的"详情"查看具体用法。 +
+ +详情 +**original_pictures**:存储原创微博的原始图片url和转发微博转发理由中的图片url。如wb.weibo[0]["original_pictures"]为最新一条微博的原始图片url,若该条微博有多张图片,则存储多个url,以英文逗号分割;若该微博没有图片,则值为"无";
+**retweet_pictures**:存储被转发微博中的原始图片url。当最新微博为原创微博或者为没有图片的转发微博时,则值为"无",否则为被转发微博的图片url。若有多张图片,则存储多个url,以英文逗号分割;
+**publish_place**:存储微博的发布位置。如wb.weibo[0]["publish_place"]为最新一条微博的发布位置,如果该条微博没有位置信息,则值为"无";
+**publish_time**:存储微博的发布时间。如wb.weibo[0]["publish_time"]为最新一条微博的发布时间;
+**up_num**:存储微博获得的点赞数。如wb.weibo[0]["up_num"]为最新一条微博获得的点赞数;
+**retweet_num**:存储微博获得的转发数。如wb.weibo[0]["retweet_num"]为最新一条微博获得的转发数;
+**comment_num**:存储微博获得的评论数。如wb.weibo[0]["comment_num"]为最新一条微博获得的评论数;
+**publish_tool**:存储微博的发布工具。如wb.weibo[0]["publish_tool"]为最新一条微博的发布工具。 +
# 如何获取cookie 1.用Chrome打开
From 409973910365cc01908a922cb660c578608c6dfa Mon Sep 17 00:00:00 2001 From: chenlei Date: Wed, 19 Jun 2019 21:30:39 +0800 Subject: [PATCH 042/363] =?UTF-8?q?style:=20=E7=BB=9F=E4=B8=80=E5=BC=95?= =?UTF-8?q?=E5=8F=B7=E7=9A=84=E4=BD=BF=E7=94=A8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 以使用单引号为主 --- weiboSpider.py | 416 ++++++++++++++++++++++++------------------------- 1 file changed, 208 insertions(+), 208 deletions(-) diff --git a/weiboSpider.py b/weiboSpider.py index 4a798dd1..109ea8c0 100644 --- a/weiboSpider.py +++ b/weiboSpider.py @@ -17,21 +17,21 @@ from tqdm import tqdm -class Weibo: - cookie = {"Cookie": "your cookie"} # 将your cookie替换成自己的cookie +class Weibo(object): + cookie = {'Cookie': 'your cookie'} # 将your cookie替换成自己的cookie def __init__(self, user_id, filter=0, pic_download=0): """Weibo类初始化""" if not isinstance(user_id, int): - sys.exit(u"user_id值应为一串数字形式,请重新输入") + sys.exit(u'user_id值应为一串数字形式,请重新输入') if filter != 0 and filter != 1: - sys.exit(u"filter值应为0或1,请重新输入") + sys.exit(u'filter值应为0或1,请重新输入') if pic_download != 0 and pic_download != 1: - sys.exit(u"pic_download值应为0或1,请重新输入") + sys.exit(u'pic_download值应为0或1,请重新输入') self.user_id = user_id # 用户id,即需要我们输入的数字,如昵称为"Dear-迪丽热巴"的id为1669879400 self.filter = filter # 取值范围为0、1,程序默认值为0,代表要爬取用户的全部微博,1代表只爬取用户的原创微博 self.pic_download = pic_download # 取值范围为0、1,程序默认值为0,代表不下载微博原始图片,1代表下载 - self.nickname = "" # 用户昵称,如“Dear-迪丽热巴” + self.nickname = '' # 用户昵称,如“Dear-迪丽热巴” self.weibo_num = 0 # 用户全部微博数 self.got_num = 0 # 爬取到的微博数 self.following = 0 # 用户关注数 @@ -45,31 +45,31 @@ def deal_html(self, url): selector = etree.HTML(html) return selector except Exception as e: - print("Error: ", e) + print('Error: ', e) traceback.print_exc() def deal_garbled(self, info): """处理乱码""" try: - info = (info.xpath("string(.)").replace(u"\u200b", "").encode( - sys.stdout.encoding, "ignore").decode(sys.stdout.encoding)) + info = (info.xpath('string(.)').replace(u'\u200b', '').encode( + 
sys.stdout.encoding, 'ignore').decode(sys.stdout.encoding)) return info except Exception as e: - print("Error: ", e) + print('Error: ', e) traceback.print_exc() def get_nickname(self): """获取用户昵称""" try: - url = "https://weibo.cn/%d/info" % (self.user_id) + url = 'https://weibo.cn/%d/info' % (self.user_id) selector = self.deal_html(url) - nickname = selector.xpath("//title/text()")[0] + nickname = selector.xpath('//title/text()')[0] self.nickname = nickname[:-3] - if self.nickname == u"登录 - 新" or self.nickname == u"新浪": - sys.exit(u"cookie错误或已过期,请按照README中方法重新获取") - print(u"用户昵称: " + self.nickname) + if self.nickname == u'登录 - 新' or self.nickname == u'新浪': + sys.exit(u'cookie错误或已过期,请按照README中方法重新获取') + print(u'用户昵称: ' + self.nickname) except Exception as e: - print("Error: ", e) + print('Error: ', e) traceback.print_exc() def get_user_info(self, selector): @@ -79,16 +79,16 @@ def get_user_info(self, selector): user_info = selector.xpath("//div[@class='tip2']/*/text()") self.weibo_num = int(user_info[0][3:-1]) - print(u"微博数: " + str(self.weibo_num)) + print(u'微博数: ' + str(self.weibo_num)) self.following = int(user_info[1][3:-1]) - print(u"关注数: " + str(self.following)) + print(u'关注数: ' + str(self.following)) self.followers = int(user_info[2][3:-1]) - print(u"粉丝数: " + str(self.followers)) - print("*" * 100) + print(u'粉丝数: ' + str(self.followers)) + print('*' * 100) except Exception as e: - print("Error: ", e) + print('Error: ', e) traceback.print_exc() def get_page_num(self, selector): @@ -98,10 +98,10 @@ def get_page_num(self, selector): page_num = 1 else: page_num = (int)( - selector.xpath("//input[@name='mp']")[0].attrib["value"]) + selector.xpath("//input[@name='mp']")[0].attrib['value']) return page_num except Exception as e: - print("Error: ", e) + print('Error: ', e) traceback.print_exc() def get_long_weibo(self, weibo_link): @@ -111,37 +111,37 @@ def get_long_weibo(self, weibo_link): info = selector.xpath("//div[@class='c']")[1] wb_content = 
self.deal_garbled(info) wb_time = info.xpath("//span[@class='ct']/text()")[0] - weibo_content = wb_content[wb_content.find(":") + + weibo_content = wb_content[wb_content.find(':') + 1:wb_content.rfind(wb_time)] return weibo_content except Exception as e: - print("Error: ", e) + print('Error: ', e) traceback.print_exc() def get_original_weibo(self, info, weibo_id): """获取原创微博""" try: weibo_content = self.deal_garbled(info) - weibo_content = weibo_content[:weibo_content.rfind(u"赞")] - a_text = info.xpath("div//a/text()") - if u"全文" in a_text: - weibo_link = "https://weibo.cn/comment/" + weibo_id + weibo_content = weibo_content[:weibo_content.rfind(u'赞')] + a_text = info.xpath('div//a/text()') + if u'全文' in a_text: + weibo_link = 'https://weibo.cn/comment/' + weibo_id wb_content = self.get_long_weibo(weibo_link) if wb_content: weibo_content = wb_content return weibo_content except Exception as e: - print("Error: ", e) + print('Error: ', e) traceback.print_exc() def get_long_retweet(self, weibo_link): """获取长转发微博""" try: wb_content = self.get_long_weibo(weibo_link) - weibo_content = wb_content[:wb_content.rfind(u"原文转发")] + weibo_content = wb_content[:wb_content.rfind(u'原文转发')] return weibo_content except Exception as e: - print("Error: ", e) + print('Error: ', e) traceback.print_exc() def get_retweet(self, info, weibo_id): @@ -149,27 +149,27 @@ def get_retweet(self, info, weibo_id): try: original_user = info.xpath("div/span[@class='cmt']/a/text()") if not original_user: - wb_content = u"转发微博已被删除" + wb_content = u'转发微博已被删除' return wb_content else: original_user = original_user[0] wb_content = self.deal_garbled(info) - wb_content = wb_content[wb_content.find(":") + - 1:wb_content.rfind(u"赞")] - wb_content = wb_content[:wb_content.rfind(u"赞")] - a_text = info.xpath("div//a/text()") - if u"全文" in a_text: - weibo_link = "https://weibo.cn/comment/" + weibo_id + wb_content = wb_content[wb_content.find(':') + + 1:wb_content.rfind(u'赞')] + wb_content = 
wb_content[:wb_content.rfind(u'赞')] + a_text = info.xpath('div//a/text()') + if u'全文' in a_text: + weibo_link = 'https://weibo.cn/comment/' + weibo_id weibo_content = self.get_long_retweet(weibo_link) if weibo_content: wb_content = weibo_content - retweet_reason = self.deal_garbled(info.xpath("div")[-1]) - retweet_reason = retweet_reason[:retweet_reason.rindex(u"赞")] - wb_content = (retweet_reason + "\n" + u"原始用户: " + original_user + - "\n" + u"转发内容: " + wb_content) + retweet_reason = self.deal_garbled(info.xpath('div')[-1]) + retweet_reason = retweet_reason[:retweet_reason.rindex(u'赞')] + wb_content = (retweet_reason + '\n' + u'原始用户: ' + original_user + + '\n' + u'转发内容: ' + wb_content) return wb_content except Exception as e: - print("Error: ", e) + print('Error: ', e) traceback.print_exc() def is_original(self, info): @@ -183,7 +183,7 @@ def is_original(self, info): def get_weibo_content(self, info, is_original): """获取微博内容""" try: - weibo_id = info.xpath("@id")[0][2:] + weibo_id = info.xpath('@id')[0][2:] if is_original: weibo_content = self.get_original_weibo(info, weibo_id) else: @@ -191,33 +191,33 @@ def get_weibo_content(self, info, is_original): print(weibo_content) return weibo_content except Exception as e: - print("Error: ", e) + print('Error: ', e) traceback.print_exc() def get_publish_place(self, info): """获取微博发布位置""" try: - div_first = info.xpath("div")[0] - a_list = div_first.xpath("a") - publish_place = u"无" + div_first = info.xpath('div')[0] + a_list = div_first.xpath('a') + publish_place = u'无' for a in a_list: - if ("place.weibo.com" in a.xpath("@href")[0] - and a.xpath("text()")[0] == u"显示地图"): + if ('place.weibo.com' in a.xpath('@href')[0] + and a.xpath('text()')[0] == u'显示地图'): weibo_a = div_first.xpath("span[@class='ctt']/a") if len(weibo_a) >= 1: publish_place = weibo_a[-1] - if (u"视频" == div_first.xpath( + if (u'视频' == div_first.xpath( "span[@class='ctt']/a/text()")[-1][-2:]): if len(weibo_a) >= 2: publish_place = weibo_a[-2] else: - 
publish_place = u"无" + publish_place = u'无' publish_place = self.deal_garbled(publish_place) break - print(u"微博发布位置: " + publish_place) + print(u'微博发布位置: ' + publish_place) return publish_place except Exception as e: - print("Error: ", e) + print('Error: ', e) traceback.print_exc() def get_publish_time(self, info): @@ -225,30 +225,30 @@ def get_publish_time(self, info): try: str_time = info.xpath("div/span[@class='ct']") str_time = self.deal_garbled(str_time[0]) - publish_time = str_time.split(u"来自")[0] - if u"刚刚" in publish_time: - publish_time = datetime.now().strftime("%Y-%m-%d %H:%M") - elif u"分钟" in publish_time: - minute = publish_time[:publish_time.find(u"分钟")] + publish_time = str_time.split(u'来自')[0] + if u'刚刚' in publish_time: + publish_time = datetime.now().strftime('%Y-%m-%d %H:%M') + elif u'分钟' in publish_time: + minute = publish_time[:publish_time.find(u'分钟')] minute = timedelta(minutes=int(minute)) publish_time = (datetime.now() - - minute).strftime("%Y-%m-%d %H:%M") - elif u"今天" in publish_time: - today = datetime.now().strftime("%Y-%m-%d") + minute).strftime('%Y-%m-%d %H:%M') + elif u'今天' in publish_time: + today = datetime.now().strftime('%Y-%m-%d') time = publish_time[3:] - publish_time = today + " " + time - elif u"月" in publish_time: - year = datetime.now().strftime("%Y") + publish_time = today + ' ' + time + elif u'月' in publish_time: + year = datetime.now().strftime('%Y') month = publish_time[0:2] day = publish_time[3:5] time = publish_time[7:12] - publish_time = year + "-" + month + "-" + day + " " + time + publish_time = year + '-' + month + '-' + day + ' ' + time else: publish_time = publish_time[:16] - print(u"微博发布时间: " + publish_time) + print(u'微博发布时间: ' + publish_time) return publish_time except Exception as e: - print("Error: ", e) + print('Error: ', e) traceback.print_exc() def get_publish_tool(self, info): @@ -256,138 +256,138 @@ def get_publish_tool(self, info): try: str_time = info.xpath("div/span[@class='ct']") str_time = 
self.deal_garbled(str_time[0]) - if len(str_time.split(u"来自")) > 1: - publish_tool = str_time.split(u"来自")[1] + if len(str_time.split(u'来自')) > 1: + publish_tool = str_time.split(u'来自')[1] else: - publish_tool = u"无" - print(u"微博发布工具: " + publish_tool) + publish_tool = u'无' + print(u'微博发布工具: ' + publish_tool) return publish_tool except Exception as e: - print("Error: ", e) + print('Error: ', e) traceback.print_exc() def get_weibo_footer(self, info): """获取微博点赞数、转发数、评论数""" try: footer = {} - pattern = r"\d+" - str_footer = info.xpath("div")[-1] + pattern = r'\d+' + str_footer = info.xpath('div')[-1] str_footer = self.deal_garbled(str_footer) - str_footer = str_footer[str_footer.rfind(u"赞"):] + str_footer = str_footer[str_footer.rfind(u'赞'):] weibo_footer = re.findall(pattern, str_footer, re.M) up_num = int(weibo_footer[0]) - print(u"点赞数: " + str(up_num)) - footer["up_num"] = up_num + print(u'点赞数: ' + str(up_num)) + footer['up_num'] = up_num retweet_num = int(weibo_footer[1]) - print(u"转发数: " + str(retweet_num)) - footer["retweet_num"] = retweet_num + print(u'转发数: ' + str(retweet_num)) + footer['retweet_num'] = retweet_num comment_num = int(weibo_footer[2]) - print(u"评论数: " + str(comment_num)) - footer["comment_num"] = comment_num + print(u'评论数: ' + str(comment_num)) + footer['comment_num'] = comment_num return footer except Exception as e: - print("Error: ", e) + print('Error: ', e) traceback.print_exc() def extract_picture_urls(self, info, weibo_id): """提取微博原始图片url""" try: - a_list = info.xpath("div/a/@href") - first_pic = "https://weibo.cn/mblog/pic/" + weibo_id + "?rl=0" - all_pic = "https://weibo.cn/mblog/picAll/" + weibo_id + "?rl=1" + a_list = info.xpath('div/a/@href') + first_pic = 'https://weibo.cn/mblog/pic/' + weibo_id + '?rl=0' + all_pic = 'https://weibo.cn/mblog/picAll/' + weibo_id + '?rl=1' if first_pic in a_list: if all_pic in a_list: selector = self.deal_html(all_pic) - preview_picture_list = selector.xpath("//img/@src") + preview_picture_list = 
selector.xpath('//img/@src') picture_list = [ - p.replace("/thumb180/", "/large/") + p.replace('/thumb180/', '/large/') for p in preview_picture_list ] - picture_urls = ",".join(picture_list) + picture_urls = ','.join(picture_list) else: - preview_picture = info.xpath(".//img/@src")[-1] + preview_picture = info.xpath('.//img/@src')[-1] picture_urls = preview_picture.replace( - "/wap180/", "/large/") + '/wap180/', '/large/') else: - picture_urls = "无" + picture_urls = '无' return picture_urls except Exception as e: - print("Error: ", e) + print('Error: ', e) traceback.print_exc() def get_picture_urls(self, info, is_original): """获取微博原始图片url""" try: - weibo_id = info.xpath("@id")[0][2:] + weibo_id = info.xpath('@id')[0][2:] picture_urls = {} if is_original: original_pictures = self.extract_picture_urls(info, weibo_id) - picture_urls["original_pictures"] = original_pictures + picture_urls['original_pictures'] = original_pictures if not self.filter: - picture_urls["retweet_pictures"] = "无" + picture_urls['retweet_pictures'] = '无' else: retweet_url = info.xpath("div/a[@class='cc']/@href")[0] - retweet_id = retweet_url.split("/")[-1].split("?")[0] + retweet_id = retweet_url.split('/')[-1].split('?')[0] retweet_pictures = self.extract_picture_urls(info, retweet_id) - picture_urls["retweet_pictures"] = retweet_pictures - a_list = info.xpath("div[last()]/a/@href") - original_picture = "无" + picture_urls['retweet_pictures'] = retweet_pictures + a_list = info.xpath('div[last()]/a/@href') + original_picture = '无' for a in a_list: - if a.endswith((".gif", ".jpeg", ".jpg", ".png")): + if a.endswith(('.gif', '.jpeg', '.jpg', '.png')): original_picture = a break - picture_urls["original_pictures"] = original_picture + picture_urls['original_pictures'] = original_picture return picture_urls except Exception as e: - print("Error: ", e) + print('Error: ', e) traceback.print_exc() def download_pic(self, url, pic_path): """下载单张图片""" try: p = requests.get(url) - with open(pic_path, "wb") 
as f: + with open(pic_path, 'wb') as f: f.write(p.content) except Exception as e: error_file = self.get_filepath( - "img") + os.sep + "not_downloaded_pictures.txt" - with open(error_file, "ab") as f: - url = url + "\n" + 'img') + os.sep + 'not_downloaded_pictures.txt' + with open(error_file, 'ab') as f: + url = url + '\n' f.write(url.encode(sys.stdout.encoding)) - print("Error: ", e) + print('Error: ', e) traceback.print_exc() def download_pictures(self): """下载微博图片""" try: - print(u"即将进行图片下载") - img_dir = self.get_filepath("img") - for w in tqdm(self.weibo, desc=u"图片下载进度"): - if w["original_pictures"] != "无": - pic_prefix = w["publish_time"][:11].replace( - "-", "") + "_" + w["id"] - if "," in w["original_pictures"]: - w["original_pictures"] = w["original_pictures"].split( - ",") - for j, url in enumerate(w["original_pictures"]): - pic_suffix = url[url.rfind("."):] - pic_name = pic_prefix + "_" + str(j + + print(u'即将进行图片下载') + img_dir = self.get_filepath('img') + for w in tqdm(self.weibo, desc=u'图片下载进度'): + if w['original_pictures'] != '无': + pic_prefix = w['publish_time'][:11].replace( + '-', '') + '_' + w['id'] + if ',' in w['original_pictures']: + w['original_pictures'] = w['original_pictures'].split( + ',') + for j, url in enumerate(w['original_pictures']): + pic_suffix = url[url.rfind('.'):] + pic_name = pic_prefix + '_' + str(j + 1) + pic_suffix pic_path = img_dir + os.sep + pic_name self.download_pic(url, pic_path) else: - pic_suffix = w["original_pictures"][ - w["original_pictures"].rfind("."):] + pic_suffix = w['original_pictures'][ + w['original_pictures'].rfind('.'):] pic_name = pic_prefix + pic_suffix pic_path = img_dir + os.sep + pic_name - self.download_pic(w["original_pictures"], pic_path) - print(u"图片下载完毕,保存路径:") + self.download_pic(w['original_pictures'], pic_path) + print(u'图片下载完毕,保存路径:') print(img_dir) except Exception as e: - print("Error: ", e) + print('Error: ', e) traceback.print_exc() def get_one_weibo(self, info): @@ -396,34 +396,34 @@ def 
get_one_weibo(self, info): weibo = OrderedDict() is_original = self.is_original(info) if (not self.filter) or is_original: - weibo["id"] = info.xpath("@id")[0][2:] - weibo["content"] = self.get_weibo_content(info, + weibo['id'] = info.xpath('@id')[0][2:] + weibo['content'] = self.get_weibo_content(info, is_original) # 微博内容 picture_urls = self.get_picture_urls(info, is_original) - weibo["original_pictures"] = picture_urls[ - "original_pictures"] # 原创图片url + weibo['original_pictures'] = picture_urls[ + 'original_pictures'] # 原创图片url if not self.filter: - weibo["retweet_pictures"] = picture_urls[ - "retweet_pictures"] # 转发图片url - weibo["original"] = is_original # 是否原创微博 - weibo["publish_place"] = self.get_publish_place(info) # 微博发布位置 - weibo["publish_time"] = self.get_publish_time(info) # 微博发布时间 - weibo["publish_tool"] = self.get_publish_tool(info) # 微博发布工具 + weibo['retweet_pictures'] = picture_urls[ + 'retweet_pictures'] # 转发图片url + weibo['original'] = is_original # 是否原创微博 + weibo['publish_place'] = self.get_publish_place(info) # 微博发布位置 + weibo['publish_time'] = self.get_publish_time(info) # 微博发布时间 + weibo['publish_tool'] = self.get_publish_tool(info) # 微博发布工具 footer = self.get_weibo_footer(info) - weibo["up_num"] = footer["up_num"] # 微博点赞数 - weibo["retweet_num"] = footer["retweet_num"] # 转发数 - weibo["comment_num"] = footer["comment_num"] # 评论数 + weibo['up_num'] = footer['up_num'] # 微博点赞数 + weibo['retweet_num'] = footer['retweet_num'] # 转发数 + weibo['comment_num'] = footer['comment_num'] # 评论数 else: weibo = None return weibo except Exception as e: - print("Error: ", e) + print('Error: ', e) traceback.print_exc() def get_one_page(self, page): """获取第page页的全部微博""" try: - url = "https://weibo.cn/u/%d?page=%d" % (self.user_id, page) + url = 'https://weibo.cn/u/%d?page=%d' % (self.user_id, page) selector = self.deal_html(url) info = selector.xpath("//div[@class='c']") is_exist = info[0].xpath("div/span[@class='ctt']") @@ -433,68 +433,68 @@ def get_one_page(self, page): if 
weibo: self.weibo.append(weibo) self.got_num += 1 - print("-" * 100) + print('-' * 100) except Exception as e: - print("Error: ", e) + print('Error: ', e) traceback.print_exc() def get_filepath(self, type): """获取结果文件路径""" try: file_dir = os.path.split(os.path.realpath( - __file__))[0] + os.sep + "weibo" + os.sep + self.nickname - if type == "img": - file_dir = file_dir + os.sep + "img" + __file__))[0] + os.sep + 'weibo' + os.sep + self.nickname + if type == 'img': + file_dir = file_dir + os.sep + 'img' if not os.path.isdir(file_dir): os.makedirs(file_dir) - if type == "img": + if type == 'img': return file_dir - file_path = file_dir + os.sep + "%d" % self.user_id + "." + type + file_path = file_dir + os.sep + '%d' % self.user_id + '.' + type return file_path except Exception as e: - print("Error: ", e) + print('Error: ', e) traceback.print_exc() def write_csv(self, wrote_num): """将爬取的信息写入csv文件""" try: result_headers = [ - "微博id", - "微博正文", - "原始图片url", - "发布位置", - "发布时间", - "发布工具", - "点赞数", - "转发数", - "评论数", + '微博id', + '微博正文', + '原始图片url', + '发布位置', + '发布时间', + '发布工具', + '点赞数', + '转发数', + '评论数', ] if not self.filter: - result_headers.insert(3, "被转发微博原始图片url") - result_headers.insert(4, "是否为原创微博") + result_headers.insert(3, '被转发微博原始图片url') + result_headers.insert(4, '是否为原创微博') result_data = [w.values() for w in self.weibo][wrote_num:] - if sys.version < "3": # python2.x + if sys.version < '3': # python2.x reload(sys) - sys.setdefaultencoding("utf-8") - with open(self.get_filepath("csv"), "ab") as f: + sys.setdefaultencoding('utf-8') + with open(self.get_filepath('csv'), 'ab') as f: f.write(codecs.BOM_UTF8) writer = csv.writer(f) if wrote_num == 0: writer.writerows([result_headers]) writer.writerows(result_data) else: # python3.x - with open(self.get_filepath("csv"), - "a", - encoding="utf-8-sig", - newline="") as f: + with open(self.get_filepath('csv'), + 'a', + encoding='utf-8-sig', + newline='') as f: writer = csv.writer(f) if wrote_num == 0: 
writer.writerows([result_headers]) writer.writerows(result_data) - print(u"%d条微博写入csv文件完毕,保存路径:" % self.got_num) - print(self.get_filepath("csv")) + print(u'%d条微博写入csv文件完毕,保存路径:' % self.got_num) + print(self.get_filepath('csv')) except Exception as e: - print("Error: ", e) + print('Error: ', e) traceback.print_exc() def write_txt(self, wrote_num): @@ -503,30 +503,30 @@ def write_txt(self, wrote_num): temp_result = [] if wrote_num == 0: if self.filter: - result_header = u"\n\n原创微博内容: \n" + result_header = u'\n\n原创微博内容: \n' else: - result_header = u"\n\n微博内容: \n" - result_header = (u"用户信息\n用户昵称:" + self.nickname + u"\n用户id: " + - str(self.user_id) + u"\n微博数: " + - str(self.weibo_num) + u"\n关注数: " + - str(self.following) + u"\n粉丝数: " + + result_header = u'\n\n微博内容: \n' + result_header = (u'用户信息\n用户昵称:' + self.nickname + u'\n用户id: ' + + str(self.user_id) + u'\n微博数: ' + + str(self.weibo_num) + u'\n关注数: ' + + str(self.following) + u'\n粉丝数: ' + str(self.followers) + result_header) temp_result.append(result_header) for i, w in enumerate(self.weibo[wrote_num:]): temp_result.append( - str(wrote_num + i + 1) + ":" + w["content"] + "\n" + - u"微博位置: " + w["publish_place"] + "\n" + u"发布时间: " + - w["publish_time"] + "\n" + u"点赞数: " + str(w["up_num"]) + - u" 转发数: " + str(w["retweet_num"]) + u" 评论数: " + - str(w["comment_num"]) + "\n" + u"发布工具: " + - w["publish_tool"] + "\n\n") - result = "".join(temp_result) - with open(self.get_filepath("txt"), "ab") as f: + str(wrote_num + i + 1) + ':' + w['content'] + '\n' + + u'微博位置: ' + w['publish_place'] + '\n' + u'发布时间: ' + + w['publish_time'] + '\n' + u'点赞数: ' + str(w['up_num']) + + u' 转发数: ' + str(w['retweet_num']) + u' 评论数: ' + + str(w['comment_num']) + '\n' + u'发布工具: ' + + w['publish_tool'] + '\n\n') + result = ''.join(temp_result) + with open(self.get_filepath('txt'), 'ab') as f: f.write(result.encode(sys.stdout.encoding)) - print(u"%d条微博写入txt文件完毕,保存路径:" % self.got_num) - print(self.get_filepath("txt")) + print(u'%d条微博写入txt文件完毕,保存路径:' % 
self.got_num) + print(self.get_filepath('txt')) except Exception as e: - print("Error: ", e) + print('Error: ', e) traceback.print_exc() def write_file(self, wrote_num): @@ -538,14 +538,14 @@ def write_file(self, wrote_num): def get_weibo_info(self): """获取微博信息""" try: - url = "https://weibo.cn/u/%d" % (self.user_id) + url = 'https://weibo.cn/u/%d' % (self.user_id) selector = self.deal_html(url) self.get_user_info(selector) # 获取用户昵称、微博数、关注数、粉丝数 page_num = self.get_page_num(selector) # 获取微博总页数 wrote_num = 0 page1 = 0 random_pages = random.randint(1, 5) - for page in tqdm(range(1, page_num + 1), desc=u"进度"): + for page in tqdm(range(1, page_num + 1), desc=u'进度'): self.get_one_page(page) # 获取第page页的全部微博 if page % 20 == 0: # 每爬20页写入一次文件 @@ -562,23 +562,23 @@ def get_weibo_info(self): self.write_file(wrote_num) # 将剩余不足20页的微博写入文件 if not self.filter: - print(u"共爬取" + str(self.got_num) + u"条微博") + print(u'共爬取' + str(self.got_num) + u'条微博') else: - print(u"共爬取" + str(self.got_num) + u"条原创微博") + print(u'共爬取' + str(self.got_num) + u'条原创微博') except Exception as e: - print("Error: ", e) + print('Error: ', e) traceback.print_exc() def start(self): """运行爬虫""" try: self.get_weibo_info() - print(u"信息抓取完毕") - print("*" * 100) + print(u'信息抓取完毕') + print('*' * 100) if self.pic_download == 1: self.download_pictures() except Exception as e: - print("Error: ", e) + print('Error: ', e) traceback.print_exc() @@ -590,22 +590,22 @@ def main(): pic_download = 1 # 值为0代表不下载微博原始图片,1代表下载微博原始图片 wb = Weibo(user_id, filter, pic_download) # 调用Weibo类,创建微博实例wb wb.start() # 爬取微博信息 - print(u"用户昵称: " + wb.nickname) - print(u"全部微博数: " + str(wb.weibo_num)) - print(u"关注数: " + str(wb.following)) - print(u"粉丝数: " + str(wb.followers)) + print(u'用户昵称: ' + wb.nickname) + print(u'全部微博数: ' + str(wb.weibo_num)) + print(u'关注数: ' + str(wb.following)) + print(u'粉丝数: ' + str(wb.followers)) if wb.weibo: - print(u"最新/置顶 微博为: " + wb.weibo[0]["content"]) - print(u"最新/置顶 微博位置: " + wb.weibo[0]["publish_place"]) - print(u"最新/置顶 
微博发布时间: " + wb.weibo[0]["publish_time"]) - print(u"最新/置顶 微博获得赞数: " + str(wb.weibo[0]["up_num"])) - print(u"最新/置顶 微博获得转发数: " + str(wb.weibo[0]["retweet_num"])) - print(u"最新/置顶 微博获得评论数: " + str(wb.weibo[0]["comment_num"])) - print(u"最新/置顶 微博发布工具: " + wb.weibo[0]["publish_tool"]) + print(u'最新/置顶 微博为: ' + wb.weibo[0]['content']) + print(u'最新/置顶 微博位置: ' + wb.weibo[0]['publish_place']) + print(u'最新/置顶 微博发布时间: ' + wb.weibo[0]['publish_time']) + print(u'最新/置顶 微博获得赞数: ' + str(wb.weibo[0]['up_num'])) + print(u'最新/置顶 微博获得转发数: ' + str(wb.weibo[0]['retweet_num'])) + print(u'最新/置顶 微博获得评论数: ' + str(wb.weibo[0]['comment_num'])) + print(u'最新/置顶 微博发布工具: ' + wb.weibo[0]['publish_tool']) except Exception as e: - print("Error: ", e) + print('Error: ', e) traceback.print_exc() -if __name__ == "__main__": +if __name__ == '__main__': main() From 45f2eddf70f7d2c853a89d078882bcf4bede2918 Mon Sep 17 00:00:00 2001 From: chenlei Date: Wed, 19 Jun 2019 22:31:21 +0800 Subject: [PATCH 043/363] =?UTF-8?q?fix:=20=E4=BF=AE=E5=A4=8D=E4=BA=86?= =?UTF-8?q?=E5=9B=A0=E9=83=A8=E5=88=86=E7=88=AC=E8=99=AB=E5=BE=AE=E5=8D=9A?= =?UTF-8?q?=E8=A2=AB=E8=AE=BE=E7=BD=AE=E6=88=90=E4=BA=86"=E4=B8=8D?= =?UTF-8?q?=E6=98=BE=E7=A4=BA=E5=9B=BE=E7=89=87"=E5=AF=BC=E8=87=B4?= =?UTF-8?q?=E7=A8=8B=E5=BA=8F=E5=87=BA=E9=94=99=E7=9A=84bug?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Issue #12 --- weiboSpider.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/weiboSpider.py b/weiboSpider.py index 109ea8c0..64fd33d7 100644 --- a/weiboSpider.py +++ b/weiboSpider.py @@ -308,9 +308,15 @@ def extract_picture_urls(self, info, weibo_id): ] picture_urls = ','.join(picture_list) else: - preview_picture = info.xpath('.//img/@src')[-1] - picture_urls = preview_picture.replace( - '/wap180/', '/large/') + if info.xpath('.//img/@src'): + preview_picture = info.xpath('.//img/@src')[-1] + picture_urls = preview_picture.replace( + '/wap180/', '/large/') + else: + 
sys.exit( + u"爬虫微博可能被设置成了'不显示图片',请前往" + u"'https://weibo.cn/account/customize/pic',修改为'显示'" + ) else: picture_urls = '无' return picture_urls From d544a158db8bb0af39544ac82808e794bd738f76 Mon Sep 17 00:00:00 2001 From: chenlei Date: Tue, 25 Jun 2019 18:53:06 +0800 Subject: [PATCH 044/363] =?UTF-8?q?fix:=20=E4=BF=AE=E5=A4=8D=E7=88=AC?= =?UTF-8?q?=E5=8F=96=E5=AE=8C=E6=88=90=E5=90=8E=E5=8F=AF=E8=83=BD=E7=AD=89?= =?UTF-8?q?=E5=BE=85=E8=8B=A5=E5=B9=B2=E7=A7=92=E7=9A=84bug?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- weiboSpider.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/weiboSpider.py b/weiboSpider.py index 64fd33d7..fd5ddb3b 100644 --- a/weiboSpider.py +++ b/weiboSpider.py @@ -561,7 +561,7 @@ def get_weibo_info(self): # 通过加入随机等待避免被限制。爬虫速度过快容易被系统限制(一段时间后限 # 制会自动解除),加入随机等待模拟人的操作,可降低被系统限制的风险。默 # 认是每爬取1到5页随机等待6到10秒,如果仍然被限,可适当增加sleep时间 - if page - page1 == random_pages: + if page - page1 == random_pages and page < page_num: sleep(random.randint(6, 10)) page1 = page random_pages = random.randint(1, 5) From 9c6a42a66f9afae078e7ac7564d8ea2f96240258 Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Sat, 29 Jun 2019 20:52:26 +0800 Subject: [PATCH 045/363] Update README.md --- README.md | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index 9095c642..3af1cbd9 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,9 @@ # 功能 爬取新浪微博信息,并写入csv/txt文件,文件名为目标用户id加".csv"和".txt"的形式,同时还会下载该微博原始图片(可选)。

+本程序需要设置用户cookie,以获取微博访问权限,后面会讲解如何获取cookie。如需免cookie版,大家可以访问, +二者功能类似,免cookie版因为不需要cookie,用法更简单,但功能却更多。
+
以爬取迪丽热巴的微博为例,她的微博昵称为"Dear-迪丽热巴",id为1669879400(后面会讲如何获取用户id)。我们选择爬取她的原创微博。程序会自动生成一个weibo文件夹,我们以后爬取的所有微博都被存储在这里。然后程序在该文件夹下生成一个名为"Dear-迪丽热巴"的文件夹,迪丽热巴的所有微博爬取结果都在这里。"Dear-迪丽热巴"文件夹里包含一个csv文件、一个txt文件和一个img文件夹,img文件夹用来存储下载到的图片。

csv文件结果如下所示: @@ -12,7 +15,7 @@ txt文件结果如下所示: 本次下载了766张图片,大小一共1.15GB,包括她原创微博中的图片和转发微博转发理由中的图片。图片名为yyyymmdd+微博id的形式,若某条微博存在多张图片,则图片名中还会包括它在微博图片中的序号。本次下载有一张图片因为超时没有下载下来,该图片url被写到了not_downloaded_pictures.txt。 # 输入 -用户id,例如新浪微博昵称为“Dear-迪丽热巴”的id为“1669879400” +用户id,例如新浪微博昵称为"Dear-迪丽热巴"的id为"1669879400" # 输出 - 昵称:用户昵称,如"Dear-迪丽热巴" @@ -42,7 +45,7 @@ $ git clone https://github.com/dataabc/weibospider.git ``` 运行上述命令,将本项目下载到当前目录,如果下载成功当前目录会出现一个名为"weibospider"的文件夹; ## 2.设置cookie和user_id -打开weibospider文件夹下的"**weibospider.py**"文件,将“**your cookie**”替换成爬虫微博的cookie,后面会详细讲解如何获取cookie;将**user_id**替换成想要爬取的微博的user_id,后面会详细讲解如何获取user_id; +打开weibospider文件夹下的"**weibospider.py**"文件,将"**your cookie**"替换成爬虫微博的cookie,后面会详细讲解如何获取cookie;将**user_id**替换成想要爬取的微博的user_id,后面会详细讲解如何获取user_id; ## 3.运行脚本 大家可以根据自己的运行环境选择运行方式,Linux可以通过 ```bash @@ -64,18 +67,23 @@ user_id可以改成任意合法的用户id(爬虫的微博id除外);filter **wb.weibo_num**:微博数;
**wb.following**:关注数;
**wb.followers**:粉丝数;
-**wb.weibo**:除不包含上述信息外,wb.weibo包含爬取到的所有微博信息,如**微博id**、**微博正文**、**原始图片url**、**发布位置**、**发布时间**、**发布工具**、**点赞数**、**转发数**、**评论数**等。如果爬的是全部微博(原创+转发),除上述信息之外,还包含被**转发微博原始图片url**、**是否为原创微博**等。wb.weibo是一个列表,包含了爬取的所有微博信息。wb.weibo[0]为爬取的第一条微博,wb.weibo[1]为爬取的第二条微博,以此类推。当filter=1时,wb.weibo[0]为爬取的第一条**原创**微博,以此类推。wb.weibo[0]["id"]为第一条微博的id,wb.weibo[0]["content"]为第一条微博的正文,wb.weibo[0]["publish_time"]为第一条微博的发布时间,还有其它很多信息不在赘述,大家可以点击下面的"详情"查看具体用法。 +**wb.weibo**:除不包含上述信息外,wb.weibo包含爬取到的所有微博信息,如**微博id**、**微博正文**、**原始图片url**、**发布位置**、**发布时间**、**发布工具**、**点赞数**、**转发数**、**评论数**等。如果爬的是全部微博(原创+转发),除上述信息之外,还包含被**转发微博原始图片url**、**是否为原创微博**等。wb.weibo是一个列表,包含了爬取的所有微博信息。wb.weibo[0]为爬取的第一条微博,wb.weibo[1]为爬取的第二条微博,以此类推。当filter=1时,wb.weibo[0]为爬取的第一条**原创**微博,以此类推。wb.weibo[0]['id']为第一条微博的id,wb.weibo[0]['content']为第一条微博的正文,wb.weibo[0]['publish_time']为第一条微博的发布时间,还有其它很多信息不在赘述,大家可以点击下面的"详情"查看具体用法。
详情 -**original_pictures**:存储原创微博的原始图片url和转发微博转发理由中的图片url。如wb.weibo[0]["original_pictures"]为最新一条微博的原始图片url,若该条微博有多张图片,则存储多个url,以英文逗号分割;若该微博没有图片,则值为"无";
+ +若目标微博用户存在微博,则:
+**id**:存储微博id。如wb.weibo[0]['id']为最新一条微博的id;
+**content**:存储微博正文。如wb.weibo[0]['content']为最新一条微博的正文;
+**original_pictures**:存储原创微博的原始图片url和转发微博转发理由中的图片url。如wb.weibo[0]['original_pictures']为最新一条微博的原始图片url,若该条微博有多张图片,则存储多个url,以英文逗号分割;若该微博没有图片,则值为"无";
**retweet_pictures**:存储被转发微博中的原始图片url。当最新微博为原创微博或者为没有图片的转发微博时,则值为"无",否则为被转发微博的图片url。若有多张图片,则存储多个url,以英文逗号分割;
-**publish_place**:存储微博的发布位置。如wb.weibo[0]["publish_place"]为最新一条微博的发布位置,如果该条微博没有位置信息,则值为"无";
-**publish_time**:存储微博的发布时间。如wb.weibo[0]["publish_time"]为最新一条微博的发布时间;
-**up_num**:存储微博获得的点赞数。如wb.weibo[0]["up_num"]为最新一条微博获得的点赞数;
-**retweet_num**:存储微博获得的转发数。如wb.weibo[0]["retweet_num"]为最新一条微博获得的转发数;
-**comment_num**:存储微博获得的评论数。如wb.weibo[0]["comment_num"]为最新一条微博获得的评论数;
-**publish_tool**:存储微博的发布工具。如wb.weibo[0]["publish_tool"]为最新一条微博的发布工具。 +**publish_place**:存储微博的发布位置。如wb.weibo[0]['publish_place']为最新一条微博的发布位置,如果该条微博没有位置信息,则值为"无";
+**publish_time**:存储微博的发布时间。如wb.weibo[0]['publish_time']为最新一条微博的发布时间;
+**up_num**:存储微博获得的点赞数。如wb.weibo[0]['up_num']为最新一条微博获得的点赞数;
+**retweet_num**:存储微博获得的转发数。如wb.weibo[0]['retweet_num']为最新一条微博获得的转发数;
+**comment_num**:存储微博获得的评论数。如wb.weibo[0]['comment_num']为最新一条微博获得的评论数;
+**publish_tool**:存储微博的发布工具。如wb.weibo[0]['publish_tool']为最新一条微博的发布工具。 +
# 如何获取cookie @@ -89,7 +97,7 @@ user_id可以改成任意合法的用户id(爬虫的微博id除外);filter ![](https://picture.cognize.me/cognize/github/weibospider/cookie3.png) # 如何获取user_id -1.打开网址,搜索我们要找的人,如”迪丽热巴“,进入她的主页;
+1.打开网址<https://weibo.cn>,搜索我们要找的人,如"迪丽热巴",进入她的主页;
![](https://picture.cognize.me/cognize/github/weibospider/user_home.png) 2.按照上图箭头所指,点击"资料"链接,跳转到用户资料页面;
![](https://picture.cognize.me/cognize/github/weibospider/user_info.png) From bd2000611d9aed4b06abb7239b19febc32d0dfeb Mon Sep 17 00:00:00 2001 From: chenlei Date: Sat, 27 Jul 2019 17:55:23 +0800 Subject: [PATCH 046/363] add requirements.txt --- requirements.txt | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 requirements.txt diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 00000000..d0747725 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,3 @@ +lxml==4.3.4 +requests==2.22.0 +tqdm==4.32.2 From 98a485c9f3c94e36ab3c50dee974c328c767c2c2 Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Sat, 27 Jul 2019 18:05:54 +0800 Subject: [PATCH 047/363] Update README.md --- README.md | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 3af1cbd9..d94ad41c 100644 --- a/README.md +++ b/README.md @@ -44,15 +44,19 @@ txt文件结果如下所示: $ git clone https://github.com/dataabc/weibospider.git ``` 运行上述命令,将本项目下载到当前目录,如果下载成功当前目录会出现一个名为"weibospider"的文件夹; -## 2.设置cookie和user_id +## 2.安装依赖 +```bash +pip install -r requirements.txt +``` +## 3.设置cookie和user_id 打开weibospider文件夹下的"**weibospider.py**"文件,将"**your cookie**"替换成爬虫微博的cookie,后面会详细讲解如何获取cookie;将**user_id**替换成想要爬取的微博的user_id,后面会详细讲解如何获取user_id; -## 3.运行脚本 +## 4.运行脚本 大家可以根据自己的运行环境选择运行方式,Linux可以通过 ```bash $ python weibospider.py ``` 运行; -## 4.按需求修改脚本(可选) +## 5.按需求修改脚本(可选) 本脚本是一个Weibo类,用户可以按照自己的需求调用Weibo类。 例如用户可以直接在"weibospider.py"文件中调用Weibo类,具体调用代码示例如下: ```python From c0862b972af92b1b7981f15c7be6806e523d981d Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Sun, 4 Aug 2019 20:20:41 +0800 Subject: [PATCH 048/363] Update README.md --- README.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/README.md b/README.md index d94ad41c..d4877978 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,17 @@ +* [功能](#功能) +* [输入](#输入) +* [输出](#输出) +* [运行环境](#运行环境) +* [使用说明](#使用说明) + * [下载脚本](#1下载脚本) + * [安装依赖](#2安装依赖) + * [设置cookie和user_id](#3设置cookie和user_id) + * 
[运行脚本](#4运行脚本) + * [按需求修改脚本](#5按需求修改脚本可选) +* [如何获取cookie](#如何获取cookie) +* [如何获取user_id](#如何获取user_id) +* [注意事项](#注意事项) + # 功能 爬取新浪微博信息,并写入csv/txt文件,文件名为目标用户id加".csv"和".txt"的形式,同时还会下载该微博原始图片(可选)。

From 00d988fdc2843aea95ec4b98f2823f166f9a0bf1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Cdataabc?= Date: Mon, 12 Aug 2019 19:23:16 +0800 Subject: [PATCH 049/363] =?UTF-8?q?fix:=20=E4=BF=AE=E5=A4=8D=E5=9C=A8?= =?UTF-8?q?=E8=8E=B7=E5=8F=96=E5=9B=BE=E7=89=87=E5=9C=B0=E5=9D=80=E6=97=B6?= =?UTF-8?q?=E5=9B=A0=E7=BD=91=E7=BB=9C=E5=8E=9F=E5=9B=A0=E5=87=BA=E7=8E=B0?= =?UTF-8?q?"argument=20of=20type=20'NoneType'=20is=20not=20iterable"?= =?UTF-8?q?=E7=9A=84=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Issue #78 --- weiboSpider.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/weiboSpider.py b/weiboSpider.py index fd5ddb3b..8537e8b2 100644 --- a/weiboSpider.py +++ b/weiboSpider.py @@ -115,6 +115,7 @@ def get_long_weibo(self, weibo_link): 1:wb_content.rfind(wb_time)] return weibo_content except Exception as e: + return '网络出错' print('Error: ', e) traceback.print_exc() @@ -321,6 +322,7 @@ def extract_picture_urls(self, info, weibo_id): picture_urls = '无' return picture_urls except Exception as e: + return '无' print('Error: ', e) traceback.print_exc() From 3aca99af8d8370144d38828de310b42dc92c9612 Mon Sep 17 00:00:00 2001 From: chenlei Date: Tue, 13 Aug 2019 01:36:14 +0800 Subject: [PATCH 050/363] =?UTF-8?q?feat:=20=E6=B7=BB=E5=8A=A0=E8=8E=B7?= =?UTF-8?q?=E5=8F=96=E5=BE=AE=E5=8D=9A=E8=A7=86=E9=A2=91url=E5=8A=9F?= =?UTF-8?q?=E8=83=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Issue #77 --- weiboSpider.py | 44 ++++++++++++++++++++++++++++++++++++++------ 1 file changed, 38 insertions(+), 6 deletions(-) diff --git a/weiboSpider.py b/weiboSpider.py index 8537e8b2..3cf9f23f 100644 --- a/weiboSpider.py +++ b/weiboSpider.py @@ -115,7 +115,7 @@ def get_long_weibo(self, weibo_link): 1:wb_content.rfind(wb_time)] return weibo_content except Exception as e: - return '网络出错' + return u'网络出错' print('Error: ', e) traceback.print_exc() @@ -319,10 +319,10 @@ def 
extract_picture_urls(self, info, weibo_id): u"'https://weibo.cn/account/customize/pic',修改为'显示'" ) else: - picture_urls = '无' + picture_urls = u'无' return picture_urls except Exception as e: - return '无' + return u'无' print('Error: ', e) traceback.print_exc() @@ -335,14 +335,14 @@ def get_picture_urls(self, info, is_original): original_pictures = self.extract_picture_urls(info, weibo_id) picture_urls['original_pictures'] = original_pictures if not self.filter: - picture_urls['retweet_pictures'] = '无' + picture_urls['retweet_pictures'] = u'无' else: retweet_url = info.xpath("div/a[@class='cc']/@href")[0] retweet_id = retweet_url.split('/')[-1].split('?')[0] retweet_pictures = self.extract_picture_urls(info, retweet_id) picture_urls['retweet_pictures'] = retweet_pictures a_list = info.xpath('div[last()]/a/@href') - original_picture = '无' + original_picture = u'无' for a in a_list: if a.endswith(('.gif', '.jpeg', '.jpg', '.png')): original_picture = a @@ -353,6 +353,35 @@ def get_picture_urls(self, info, is_original): print('Error: ', e) traceback.print_exc() + def get_video_url(self, info, is_original): + """获取微博视频url""" + try: + if is_original: + div_first = info.xpath('div')[0] + a_list = div_first.xpath('.//a') + video_link = u'无' + for a in a_list: + if 'm.weibo.cn/s/video/show?object_id=' in a.xpath( + '@href')[0]: + video_link = a.xpath('@href')[0] + break + if video_link != u'无': + video_link = video_link.replace( + 'm.weibo.cn/s/video/show', 'm.weibo.cn/s/video/object') + wb_info = requests.get(video_link, + cookies=self.cookie).json() + video_url = wb_info['data']['object']['stream'].get( + 'hd_url') + if not video_url: + video_url = wb_info['data']['object']['stream']['url'] + else: + video_url = u'无' + return video_url + except Exception as e: + return u'无' + print('Error: ', e) + traceback.print_exc() + def download_pic(self, url, pic_path): """下载单张图片""" try: @@ -374,7 +403,7 @@ def download_pictures(self): print(u'即将进行图片下载') img_dir = 
self.get_filepath('img') for w in tqdm(self.weibo, desc=u'图片下载进度'): - if w['original_pictures'] != '无': + if w['original_pictures'] != u'无': pic_prefix = w['publish_time'][:11].replace( '-', '') + '_' + w['id'] if ',' in w['original_pictures']: @@ -414,6 +443,8 @@ def get_one_weibo(self, info): weibo['retweet_pictures'] = picture_urls[ 'retweet_pictures'] # 转发图片url weibo['original'] = is_original # 是否原创微博 + weibo['video_url'] = self.get_video_url(info, + is_original) # 微博视频url weibo['publish_place'] = self.get_publish_place(info) # 微博发布位置 weibo['publish_time'] = self.get_publish_time(info) # 微博发布时间 weibo['publish_tool'] = self.get_publish_tool(info) # 微博发布工具 @@ -470,6 +501,7 @@ def write_csv(self, wrote_num): '微博id', '微博正文', '原始图片url', + '微博视频url', '发布位置', '发布时间', '发布工具', From b353e669341e7587031eacd80e0341abac8e4563 Mon Sep 17 00:00:00 2001 From: chenlei Date: Tue, 13 Aug 2019 20:16:32 +0800 Subject: [PATCH 051/363] =?UTF-8?q?feat:=20=E6=B7=BB=E5=8A=A0=E5=BE=AE?= =?UTF-8?q?=E5=8D=9A=E8=A7=86=E9=A2=91=E4=B8=8B=E8=BD=BD=E5=8A=9F=E8=83=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Issue #77 --- weiboSpider.py | 88 +++++++++++++++++++++++++++++--------------------- 1 file changed, 52 insertions(+), 36 deletions(-) diff --git a/weiboSpider.py b/weiboSpider.py index 3cf9f23f..6161efab 100644 --- a/weiboSpider.py +++ b/weiboSpider.py @@ -20,7 +20,7 @@ class Weibo(object): cookie = {'Cookie': 'your cookie'} # 将your cookie替换成自己的cookie - def __init__(self, user_id, filter=0, pic_download=0): + def __init__(self, user_id, filter=0, pic_download=0, video_download=0): """Weibo类初始化""" if not isinstance(user_id, int): sys.exit(u'user_id值应为一串数字形式,请重新输入') @@ -28,9 +28,12 @@ def __init__(self, user_id, filter=0, pic_download=0): sys.exit(u'filter值应为0或1,请重新输入') if pic_download != 0 and pic_download != 1: sys.exit(u'pic_download值应为0或1,请重新输入') + if video_download != 0 and video_download != 1: + sys.exit(u'video_download值应为0或1,请重新输入') 
self.user_id = user_id # 用户id,即需要我们输入的数字,如昵称为"Dear-迪丽热巴"的id为1669879400 self.filter = filter # 取值范围为0、1,程序默认值为0,代表要爬取用户的全部微博,1代表只爬取用户的原创微博 self.pic_download = pic_download # 取值范围为0、1,程序默认值为0,代表不下载微博原始图片,1代表下载 + self.video_download = video_download # 取值范围为0、1,程序默认为0,代表不下载微博视频,1代表下载 self.nickname = '' # 用户昵称,如“Dear-迪丽热巴” self.weibo_num = 0 # 用户全部微博数 self.got_num = 0 # 爬取到的微博数 @@ -382,47 +385,56 @@ def get_video_url(self, info, is_original): print('Error: ', e) traceback.print_exc() - def download_pic(self, url, pic_path): - """下载单张图片""" + def download_one_file(self, url, file_path, type, weibo_id): + """下载单个文件(图片/视频)""" try: - p = requests.get(url) - with open(pic_path, 'wb') as f: - f.write(p.content) + downloaded = requests.get(url) + with open(file_path, 'wb') as f: + f.write(downloaded.content) except Exception as e: error_file = self.get_filepath( - 'img') + os.sep + 'not_downloaded_pictures.txt' + type) + os.sep + 'not_downloaded.txt' with open(error_file, 'ab') as f: - url = url + '\n' + url = weibo_id + ':' + url + '\n' f.write(url.encode(sys.stdout.encoding)) print('Error: ', e) traceback.print_exc() - def download_pictures(self): - """下载微博图片""" + def download_files(self, type): + """下载文件(图片/视频)""" try: - print(u'即将进行图片下载') - img_dir = self.get_filepath('img') - for w in tqdm(self.weibo, desc=u'图片下载进度'): - if w['original_pictures'] != u'无': - pic_prefix = w['publish_time'][:11].replace( + if type == 'img': + describe = u'图片' + key = 'original_pictures' + else: + describe = u'视频' + key = 'video_url' + print(u'即将进行%s下载' % describe) + file_dir = self.get_filepath(type) + for w in tqdm(self.weibo, desc=u'%s下载进度' % describe): + if w[key] != u'无': + file_prefix = w['publish_time'][:11].replace( '-', '') + '_' + w['id'] - if ',' in w['original_pictures']: - w['original_pictures'] = w['original_pictures'].split( - ',') - for j, url in enumerate(w['original_pictures']): - pic_suffix = url[url.rfind('.'):] - pic_name = pic_prefix + '_' + str(j + - 1) + pic_suffix - 
pic_path = img_dir + os.sep + pic_name - self.download_pic(url, pic_path) + if type == 'img' and ',' in w[key]: + w[key] = w[key].split(',') + for j, url in enumerate(w[key]): + file_suffix = url[url.rfind('.'):] + file_name = file_prefix + '_' + str( + j + 1) + file_suffix + file_path = file_dir + os.sep + file_name + self.download_one_file(url, file_path, type, + w['id']) else: - pic_suffix = w['original_pictures'][ - w['original_pictures'].rfind('.'):] - pic_name = pic_prefix + pic_suffix - pic_path = img_dir + os.sep + pic_name - self.download_pic(w['original_pictures'], pic_path) - print(u'图片下载完毕,保存路径:') - print(img_dir) + if type == 'video': + file_suffix = '.mp4' + else: + file_suffix = w[key][w[key].rfind('.'):] + file_name = file_prefix + file_suffix + file_path = file_dir + os.sep + file_name + self.download_one_file(w[key], file_path, type, + w['id']) + print(u'%s下载完毕,保存路径:' % describe) + print(file_dir) except Exception as e: print('Error: ', e) traceback.print_exc() @@ -482,11 +494,11 @@ def get_filepath(self, type): try: file_dir = os.path.split(os.path.realpath( __file__))[0] + os.sep + 'weibo' + os.sep + self.nickname - if type == 'img': - file_dir = file_dir + os.sep + 'img' + if type == 'img' or type == 'video': + file_dir = file_dir + os.sep + type if not os.path.isdir(file_dir): os.makedirs(file_dir) - if type == 'img': + if type == 'img' or type == 'video': return file_dir file_path = file_dir + os.sep + '%d' % self.user_id + '.' 
+ type return file_path @@ -616,7 +628,9 @@ def start(self): print(u'信息抓取完毕') print('*' * 100) if self.pic_download == 1: - self.download_pictures() + self.download_files('img') + if self.video_download == 1: + self.download_files('video') except Exception as e: print('Error: ', e) traceback.print_exc() @@ -628,7 +642,9 @@ def main(): user_id = 1669879400 # 可以改成任意合法的用户id(爬虫的微博id除外) filter = 1 # 值为0表示爬取全部微博(原创微博+转发微博),值为1表示只爬取原创微博 pic_download = 1 # 值为0代表不下载微博原始图片,1代表下载微博原始图片 - wb = Weibo(user_id, filter, pic_download) # 调用Weibo类,创建微博实例wb + video_download = 1 # 值为0代表不下载微博视频,1代表下载微博视频 + wb = Weibo(user_id, filter, pic_download, + video_download) # 调用Weibo类,创建微博实例wb wb.start() # 爬取微博信息 print(u'用户昵称: ' + wb.nickname) print(u'全部微博数: ' + str(wb.weibo_num)) From e275df0a1faaf1e39c28e2344affac1637e6c51c Mon Sep 17 00:00:00 2001 From: chenlei Date: Wed, 14 Aug 2019 01:29:16 +0800 Subject: [PATCH 052/363] =?UTF-8?q?perf:=20=E4=BC=98=E5=8C=96=E5=BE=AE?= =?UTF-8?q?=E5=8D=9A=E8=A7=86=E9=A2=91=E4=B8=8B=E8=BD=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- weiboSpider.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/weiboSpider.py b/weiboSpider.py index 6161efab..8d04b9dc 100644 --- a/weiboSpider.py +++ b/weiboSpider.py @@ -14,6 +14,7 @@ import requests from lxml import etree +from requests.adapters import HTTPAdapter from tqdm import tqdm @@ -388,7 +389,9 @@ def get_video_url(self, info, is_original): def download_one_file(self, url, file_path, type, weibo_id): """下载单个文件(图片/视频)""" try: - downloaded = requests.get(url) + s = requests.Session() + s.mount(url, HTTPAdapter(max_retries=5)) + downloaded = s.get(url, timeout=(5, 10)) with open(file_path, 'wb') as f: f.write(downloaded.content) except Exception as e: From 5c2f3dc644113e4df70aa294e0385a8a522f278e Mon Sep 17 00:00:00 2001 From: chenlei Date: Wed, 14 Aug 2019 02:07:22 +0800 Subject: [PATCH 053/363] =?UTF-8?q?perf:=20=E5=9C=A8=E8=8E=B7=E5=8F=96?= 
=?UTF-8?q?=E5=BE=AE=E5=8D=9A=E8=A7=86=E9=A2=91=E5=9C=B0=E5=9D=80=E6=97=B6?= =?UTF-8?q?=EF=BC=8C=E6=B7=BB=E5=8A=A0=E5=AF=B9=E7=9B=B4=E6=92=AD=E7=9A=84?= =?UTF-8?q?=E5=A4=84=E7=90=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- weiboSpider.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/weiboSpider.py b/weiboSpider.py index 8d04b9dc..d698e88e 100644 --- a/weiboSpider.py +++ b/weiboSpider.py @@ -378,6 +378,8 @@ def get_video_url(self, info, is_original): 'hd_url') if not video_url: video_url = wb_info['data']['object']['stream']['url'] + if not video_url: # 说明该视频为直播 + video_url = u'无' else: video_url = u'无' return video_url From 8635a04770034cbf5cfb5aaf152bb2ce8d84d500 Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Wed, 14 Aug 2019 18:35:34 +0800 Subject: [PATCH 054/363] Update README.md --- README.md | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index d4877978..a63d6439 100644 --- a/README.md +++ b/README.md @@ -13,20 +13,23 @@ * [注意事项](#注意事项) # 功能 -爬取新浪微博信息,并写入csv/txt文件,文件名为目标用户id加".csv"和".txt"的形式,同时还会下载该微博原始图片(可选)。
+爬取新浪微博信息,并写入csv/txt文件,文件名为目标用户id加".csv"和".txt"的形式,同时还会下载该微博原始图片和微博视频(可选)。

本程序需要设置用户cookie,以获取微博访问权限,后面会讲解如何获取cookie。如需免cookie版,大家可以访问<https://github.com/dataabc/weibo-crawler>,二者功能类似,免cookie版因为不需要cookie,用法更简单,但功能却更多。

-以爬取迪丽热巴的微博为例,她的微博昵称为"Dear-迪丽热巴",id为1669879400(后面会讲如何获取用户id)。我们选择爬取她的原创微博。程序会自动生成一个weibo文件夹,我们以后爬取的所有微博都被存储在这里。然后程序在该文件夹下生成一个名为"Dear-迪丽热巴"的文件夹,迪丽热巴的所有微博爬取结果都在这里。"Dear-迪丽热巴"文件夹里包含一个csv文件、一个txt文件和一个img文件夹,img文件夹用来存储下载到的图片。
+以爬取迪丽热巴的微博为例,她的微博昵称为"Dear-迪丽热巴",id为1669879400(后面会讲如何获取用户id)。我们选择爬取她的原创微博。程序会自动生成一个weibo文件夹,我们以后爬取的所有微博都被存储在这里。然后程序在该文件夹下生成一个名为"Dear-迪丽热巴"的文件夹,迪丽热巴的所有微博爬取结果都在这里。"Dear-迪丽热巴"文件夹里包含一个csv文件、一个txt文件、一个img文件夹和一个video文件夹,img文件夹用来存储下载到的图片,video文件夹用来存储下载到的视频。

csv文件结果如下所示: ![](https://picture.cognize.me/cognize/github/weibospider/weibo_csv.png)*1669879400.csv*
txt文件结果如下所示: ![](https://picture.cognize.me/cognize/github/weibospider/weibo_txt.png)*1669879400.txt*
下载的图片如下所示: -![](https://picture.cognize.me/cognize/github/weibospider/picture.png)*img文件夹*
-本次下载了766张图片,大小一共1.15GB,包括她原创微博中的图片和转发微博转发理由中的图片。图片名为yyyymmdd+微博id的形式,若某条微博存在多张图片,则图片名中还会包括它在微博图片中的序号。本次下载有一张图片因为超时没有下载下来,该图片url被写到了not_downloaded_pictures.txt。 +![](https://picture.cognize.me/cognize/github/weibospider/img.png)*img文件夹*
+本次下载了793张图片,大小一共1.21GB,包括她原创微博中的图片和转发微博转发理由中的图片。图片名为yyyymmdd+微博id的形式,若某条微博存在多张图片,则图片名中还会包括它在微博图片中的序号。若某张图片因为网络等原因下载失败,程序则会以“weibo_id:pic_url”的形式将出错微博id和图片url写入同文件夹下的not_downloaded.txt里; +下载的视频如下所示: +![](https://picture.cognize.me/cognize/github/weibospider/video.png)*video文件夹*
+本次下载了70个视频,是她原创微博中的视频,视频名为yyyymmdd+微博id的形式。其中有一个视频因为网络原因下载失败,程序将它的微博id和视频url以“weibo_id:video_url”的形式写到了同文件夹下的not_downloaded.txt里。 # 输入 用户id,例如新浪微博昵称为"Dear-迪丽热巴"的id为"1669879400" @@ -47,6 +50,7 @@ txt文件结果如下所示: - 微博发布工具:微博的发布工具,如iPhone客户端、HUAWEI Mate 20 Pro等 - 结果文件:保存在当前目录weibo文件夹下以用户昵称为名的文件夹里,名字为"user_id.csv"和"user_id.txt"的形式 - 微博图片:原创微博中的图片和转发微博转发理由中的图片,保存在以用户昵称为名的文件夹下的img文件夹里 +- 微博视频:原创微博中的视频,保存在以用户昵称为名的文件夹下的video文件夹里 # 运行环境 - 开发语言:python2/python3 From 48f97e3f5504e9534e659fe8bbc7b66be7eb92bc Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Wed, 14 Aug 2019 18:40:09 +0800 Subject: [PATCH 055/363] Update README.md --- README.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index a63d6439..80712bb4 100644 --- a/README.md +++ b/README.md @@ -22,11 +22,14 @@
csv文件结果如下所示: ![](https://picture.cognize.me/cognize/github/weibospider/weibo_csv.png)*1669879400.csv*
+
txt文件结果如下所示: ![](https://picture.cognize.me/cognize/github/weibospider/weibo_txt.png)*1669879400.txt*
+
下载的图片如下所示: ![](https://picture.cognize.me/cognize/github/weibospider/img.png)*img文件夹*
-本次下载了793张图片,大小一共1.21GB,包括她原创微博中的图片和转发微博转发理由中的图片。图片名为yyyymmdd+微博id的形式,若某条微博存在多张图片,则图片名中还会包括它在微博图片中的序号。若某张图片因为网络等原因下载失败,程序则会以“weibo_id:pic_url”的形式将出错微博id和图片url写入同文件夹下的not_downloaded.txt里; +本次下载了793张图片,大小一共1.21GB,包括她原创微博中的图片和转发微博转发理由中的图片。图片名为yyyymmdd+微博id的形式,若某条微博存在多张图片,则图片名中还会包括它在微博图片中的序号。若某张图片因为网络等原因下载失败,程序则会以“weibo_id:pic_url”的形式将出错微博id和图片url写入同文件夹下的not_downloaded.txt里;
+
下载的视频如下所示: ![](https://picture.cognize.me/cognize/github/weibospider/video.png)*video文件夹*
本次下载了70个视频,是她原创微博中的视频,视频名为yyyymmdd+微博id的形式。其中有一个视频因为网络原因下载失败,程序将它的微博id和视频url以“weibo_id:video_url”的形式写到了同文件夹下的not_downloaded.txt里。 From aa1005178c98983b30d76e89694814b1888efd98 Mon Sep 17 00:00:00 2001 From: chenlei Date: Tue, 27 Aug 2019 19:05:39 +0800 Subject: [PATCH 056/363] =?UTF-8?q?feat:=20=E6=B7=BB=E5=8A=A0=E6=8C=89?= =?UTF-8?q?=E6=97=B6=E9=97=B4=E7=88=AC=E5=8F=96=E5=BE=AE=E5=8D=9A=E5=8A=9F?= =?UTF-8?q?=E8=83=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Issue #67, #80 --- weiboSpider.py | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/weiboSpider.py b/weiboSpider.py index d698e88e..59aafb93 100644 --- a/weiboSpider.py +++ b/weiboSpider.py @@ -21,18 +21,26 @@ class Weibo(object): cookie = {'Cookie': 'your cookie'} # 将your cookie替换成自己的cookie - def __init__(self, user_id, filter=0, pic_download=0, video_download=0): + def __init__(self, + user_id, + filter=0, + since_date='1900-01-01', + pic_download=0, + video_download=0): """Weibo类初始化""" if not isinstance(user_id, int): sys.exit(u'user_id值应为一串数字形式,请重新输入') if filter != 0 and filter != 1: sys.exit(u'filter值应为0或1,请重新输入') + if not self.is_date(since_date): + sys.exit(u'since_date值应为yyyy-mm-dd形式,请重新输入') if pic_download != 0 and pic_download != 1: sys.exit(u'pic_download值应为0或1,请重新输入') if video_download != 0 and video_download != 1: sys.exit(u'video_download值应为0或1,请重新输入') self.user_id = user_id # 用户id,即需要我们输入的数字,如昵称为"Dear-迪丽热巴"的id为1669879400 self.filter = filter # 取值范围为0、1,程序默认值为0,代表要爬取用户的全部微博,1代表只爬取用户的原创微博 + self.since_date = since_date # 起始时间,即爬取发布日期从该值到现在的微博,形式为yyyy-mm-dd self.pic_download = pic_download # 取值范围为0、1,程序默认值为0,代表不下载微博原始图片,1代表下载 self.video_download = video_download # 取值范围为0、1,程序默认为0,代表不下载微博视频,1代表下载 self.nickname = '' # 用户昵称,如“Dear-迪丽热巴” @@ -42,6 +50,14 @@ def __init__(self, user_id, filter=0, pic_download=0, video_download=0): self.followers = 0 # 用户粉丝数 self.weibo = [] # 存储爬取到的所有微博信息 + def is_date(self, 
since_date): + """判断日期格式是否正确""" + try: + datetime.strptime(since_date, "%Y-%m-%d") + return True + except ValueError: + return False + def deal_html(self, url): """处理html""" try: @@ -487,6 +503,8 @@ def get_one_page(self, page): for i in range(0, len(info) - 2): weibo = self.get_one_weibo(info[i]) if weibo: + if weibo['publish_time'] < self.since_date: + return True self.weibo.append(weibo) self.got_num += 1 print('-' * 100) @@ -603,7 +621,9 @@ def get_weibo_info(self): page1 = 0 random_pages = random.randint(1, 5) for page in tqdm(range(1, page_num + 1), desc=u'进度'): - self.get_one_page(page) # 获取第page页的全部微博 + is_end = self.get_one_page(page) # 获取第page页的全部微博 + if is_end: + break if page % 20 == 0: # 每爬20页写入一次文件 self.write_file(wrote_num) @@ -646,9 +666,10 @@ def main(): # 使用实例,输入一个用户id,所有信息都会存储在wb实例中 user_id = 1669879400 # 可以改成任意合法的用户id(爬虫的微博id除外) filter = 1 # 值为0表示爬取全部微博(原创微博+转发微博),值为1表示只爬取原创微博 + since_date = '2018-01-01' # 起始时间,即爬取发布日期从该值到现在的微博,形式为yyyy-mm-dd pic_download = 1 # 值为0代表不下载微博原始图片,1代表下载微博原始图片 video_download = 1 # 值为0代表不下载微博视频,1代表下载微博视频 - wb = Weibo(user_id, filter, pic_download, + wb = Weibo(user_id, filter, since_date, pic_download, video_download) # 调用Weibo类,创建微博实例wb wb.start() # 爬取微博信息 print(u'用户昵称: ' + wb.nickname) From 75d694ba68addf5ef9b7dbc01e5bd98d49c0f063 Mon Sep 17 00:00:00 2001 From: chenlei Date: Tue, 27 Aug 2019 19:28:25 +0800 Subject: [PATCH 057/363] =?UTF-8?q?feat:=20=E6=B7=BB=E5=8A=A0=E5=BE=AE?= =?UTF-8?q?=E5=8D=9A=E6=89=93=E5=8D=B0=E5=8A=9F=E8=83=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- weiboSpider.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/weiboSpider.py b/weiboSpider.py index 59aafb93..ee58befb 100644 --- a/weiboSpider.py +++ b/weiboSpider.py @@ -209,7 +209,6 @@ def get_weibo_content(self, info, is_original): weibo_content = self.get_original_weibo(info, weibo_id) else: weibo_content = self.get_retweet(info, weibo_id) - 
print(weibo_content) return weibo_content except Exception as e: print('Error: ', e) @@ -235,7 +234,6 @@ def get_publish_place(self, info): publish_place = u'无' publish_place = self.deal_garbled(publish_place) break - print(u'微博发布位置: ' + publish_place) return publish_place except Exception as e: print('Error: ', e) @@ -266,7 +264,6 @@ def get_publish_time(self, info): publish_time = year + '-' + month + '-' + day + ' ' + time else: publish_time = publish_time[:16] - print(u'微博发布时间: ' + publish_time) return publish_time except Exception as e: print('Error: ', e) @@ -281,7 +278,6 @@ def get_publish_tool(self, info): publish_tool = str_time.split(u'来自')[1] else: publish_tool = u'无' - print(u'微博发布工具: ' + publish_tool) return publish_tool except Exception as e: print('Error: ', e) @@ -298,15 +294,12 @@ def get_weibo_footer(self, info): weibo_footer = re.findall(pattern, str_footer, re.M) up_num = int(weibo_footer[0]) - print(u'点赞数: ' + str(up_num)) footer['up_num'] = up_num retweet_num = int(weibo_footer[1]) - print(u'转发数: ' + str(retweet_num)) footer['retweet_num'] = retweet_num comment_num = int(weibo_footer[2]) - print(u'评论数: ' + str(comment_num)) footer['comment_num'] = comment_num return footer except Exception as e: @@ -492,6 +485,16 @@ def get_one_weibo(self, info): print('Error: ', e) traceback.print_exc() + def print_one_weibo(self, weibo): + """打印一条微博""" + print(weibo['content']) + print(u'微博发布位置:%s' % weibo['publish_place']) + print(u'发布发布时间:%s' % weibo['publish_time']) + print(u'发布发布工具:%s' % weibo['publish_tool']) + print(u'点赞数:%d' % weibo['up_num']) + print(u'转发数:%d' % weibo['retweet_num']) + print(u'评论数:%d' % weibo['comment_num']) + def get_one_page(self, page): """获取第page页的全部微博""" try: @@ -505,6 +508,7 @@ def get_one_page(self, page): if weibo: if weibo['publish_time'] < self.since_date: return True + self.print_one_weibo(weibo) self.weibo.append(weibo) self.got_num += 1 print('-' * 100) From ec74ea91925787e77a4acac674f8cca631393ac4 Mon Sep 17 00:00:00 
2001 From: chenlei Date: Wed, 28 Aug 2019 01:08:10 +0800 Subject: [PATCH 058/363] =?UTF-8?q?perf:=20=E4=BC=98=E5=8C=96=E6=96=87?= =?UTF-8?q?=E4=BB=B6=E4=B8=8B=E8=BD=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- weiboSpider.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/weiboSpider.py b/weiboSpider.py index ee58befb..2bb6e271 100644 --- a/weiboSpider.py +++ b/weiboSpider.py @@ -400,11 +400,12 @@ def get_video_url(self, info, is_original): def download_one_file(self, url, file_path, type, weibo_id): """下载单个文件(图片/视频)""" try: - s = requests.Session() - s.mount(url, HTTPAdapter(max_retries=5)) - downloaded = s.get(url, timeout=(5, 10)) - with open(file_path, 'wb') as f: - f.write(downloaded.content) + if not os.path.isfile(file_path): + s = requests.Session() + s.mount(url, HTTPAdapter(max_retries=5)) + downloaded = s.get(url, timeout=(5, 10)) + with open(file_path, 'wb') as f: + f.write(downloaded.content) except Exception as e: error_file = self.get_filepath( type) + os.sep + 'not_downloaded.txt' From 29b2c1ba028442011cf9b7d7856d375a1c97b3a0 Mon Sep 17 00:00:00 2001 From: chenlei Date: Thu, 29 Aug 2019 20:42:08 +0800 Subject: [PATCH 059/363] =?UTF-8?q?feat:=20=E6=B7=BB=E5=8A=A0=E8=BF=9E?= =?UTF-8?q?=E7=BB=AD=E7=88=AC=E5=8F=96=E5=A4=9A=E4=B8=AA=E5=BE=AE=E5=8D=9A?= =?UTF-8?q?=E5=8A=9F=E8=83=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- weiboSpider.py | 80 +++++++++++++++++++++++++++++--------------------- 1 file changed, 47 insertions(+), 33 deletions(-) diff --git a/weiboSpider.py b/weiboSpider.py index 2bb6e271..210545c4 100644 --- a/weiboSpider.py +++ b/weiboSpider.py @@ -22,14 +22,11 @@ class Weibo(object): cookie = {'Cookie': 'your cookie'} # 将your cookie替换成自己的cookie def __init__(self, - user_id, filter=0, since_date='1900-01-01', pic_download=0, video_download=0): """Weibo类初始化""" - if not isinstance(user_id, int): - 
sys.exit(u'user_id值应为一串数字形式,请重新输入') if filter != 0 and filter != 1: sys.exit(u'filter值应为0或1,请重新输入') if not self.is_date(since_date): @@ -38,7 +35,7 @@ def __init__(self, sys.exit(u'pic_download值应为0或1,请重新输入') if video_download != 0 and video_download != 1: sys.exit(u'video_download值应为0或1,请重新输入') - self.user_id = user_id # 用户id,即需要我们输入的数字,如昵称为"Dear-迪丽热巴"的id为1669879400 + self.user_id = '' # 用户id,如昵称为"Dear-迪丽热巴"的id为'1669879400' self.filter = filter # 取值范围为0、1,程序默认值为0,代表要爬取用户的全部微博,1代表只爬取用户的原创微博 self.since_date = since_date # 起始时间,即爬取发布日期从该值到现在的微博,形式为yyyy-mm-dd self.pic_download = pic_download # 取值范围为0、1,程序默认值为0,代表不下载微博原始图片,1代表下载 @@ -81,7 +78,7 @@ def deal_garbled(self, info): def get_nickname(self): """获取用户昵称""" try: - url = 'https://weibo.cn/%d/info' % (self.user_id) + url = 'https://weibo.cn/%s/info' % (self.user_id) selector = self.deal_html(url) nickname = selector.xpath('//title/text()')[0] self.nickname = nickname[:-3] @@ -499,7 +496,7 @@ def print_one_weibo(self, weibo): def get_one_page(self, page): """获取第page页的全部微博""" try: - url = 'https://weibo.cn/u/%d?page=%d' % (self.user_id, page) + url = 'https://weibo.cn/u/%s?page=%d' % (self.user_id, page) selector = self.deal_html(url) info = selector.xpath("//div[@class='c']") is_exist = info[0].xpath("div/span[@class='ctt']") @@ -528,7 +525,7 @@ def get_filepath(self, type): os.makedirs(file_dir) if type == 'img' or type == 'video': return file_dir - file_path = file_dir + os.sep + '%d' % self.user_id + '.' + type + file_path = file_dir + os.sep + self.user_id + '.' 
+ type return file_path except Exception as e: print('Error: ', e) @@ -618,7 +615,7 @@ def write_file(self, wrote_num): def get_weibo_info(self): """获取微博信息""" try: - url = 'https://weibo.cn/u/%d' % (self.user_id) + url = 'https://weibo.cn/u/%s' % (self.user_id) selector = self.deal_html(url) self.get_user_info(selector) # 获取用户昵称、微博数、关注数、粉丝数 page_num = self.get_page_num(selector) # 获取微博总页数 @@ -651,16 +648,34 @@ def get_weibo_info(self): print('Error: ', e) traceback.print_exc() - def start(self): + def get_user_list(self, file_name): + """获取文件中的微博id信息""" + with open(file_name, 'r') as f: + user_id_list = f.read().splitlines() + return user_id_list + + def initialize_info(self, user_id): + """初始化爬虫信息""" + self.nickname = '' + self.weibo_num = 0 + self.got_num = 0 + self.following = 0 + self.followers = 0 + self.weibo = [] + self.user_id = user_id + + def start(self, user_id_list): """运行爬虫""" try: - self.get_weibo_info() - print(u'信息抓取完毕') - print('*' * 100) - if self.pic_download == 1: - self.download_files('img') - if self.video_download == 1: - self.download_files('video') + for user_id in user_id_list: + self.initialize_info(user_id) + self.get_weibo_info() + print(u'信息抓取完毕') + print('*' * 100) + if self.pic_download == 1: + self.download_files('img') + if self.video_download == 1: + self.download_files('video') except Exception as e: print('Error: ', e) traceback.print_exc() @@ -668,27 +683,26 @@ def start(self): def main(): try: - # 使用实例,输入一个用户id,所有信息都会存储在wb实例中 - user_id = 1669879400 # 可以改成任意合法的用户id(爬虫的微博id除外) + # 以下是程序配置信息,可以根据自己需求修改 filter = 1 # 值为0表示爬取全部微博(原创微博+转发微博),值为1表示只爬取原创微博 since_date = '2018-01-01' # 起始时间,即爬取发布日期从该值到现在的微博,形式为yyyy-mm-dd pic_download = 1 # 值为0代表不下载微博原始图片,1代表下载微博原始图片 video_download = 1 # 值为0代表不下载微博视频,1代表下载微博视频 - wb = Weibo(user_id, filter, since_date, pic_download, - video_download) # 调用Weibo类,创建微博实例wb - wb.start() # 爬取微博信息 - print(u'用户昵称: ' + wb.nickname) - print(u'全部微博数: ' + str(wb.weibo_num)) - print(u'关注数: ' + str(wb.following)) - 
print(u'粉丝数: ' + str(wb.followers)) - if wb.weibo: - print(u'最新/置顶 微博为: ' + wb.weibo[0]['content']) - print(u'最新/置顶 微博位置: ' + wb.weibo[0]['publish_place']) - print(u'最新/置顶 微博发布时间: ' + wb.weibo[0]['publish_time']) - print(u'最新/置顶 微博获得赞数: ' + str(wb.weibo[0]['up_num'])) - print(u'最新/置顶 微博获得转发数: ' + str(wb.weibo[0]['retweet_num'])) - print(u'最新/置顶 微博获得评论数: ' + str(wb.weibo[0]['comment_num'])) - print(u'最新/置顶 微博发布工具: ' + wb.weibo[0]['publish_tool']) + + wb = Weibo(filter, since_date, pic_download, video_download) + + # user_id_list包含了要爬的目标微博id,可以是一个,也可以是多个,也可以从文件中读取 + # 爬单个微博,user_id_list如下所示,可以改成任意合法的用户id + user_id_list = ['1669879400'] + + # 爬多个微博,user_id_list如下所示,可以改成任意合法的用户id + # user_id_list = ['1669879400', '1729370543'] + + # 也可以在文件中读取,文件中可以包含很多user_id,每个user_id占一行,文件名任意,类型为txt,位置位于本程序的同目录下, + # 比如文件可以叫user_id_list.txt,读取文件中的user_id_list如下所示: + # user_id_list = wb.get_user_list('user_id_list.txt') + + wb.start(user_id_list) # 爬取微博信息 except Exception as e: print('Error: ', e) traceback.print_exc() From 40ea1e1a3db54f3582de308f0521bf51191bd481 Mon Sep 17 00:00:00 2001 From: chenlei Date: Sat, 31 Aug 2019 20:00:35 +0800 Subject: [PATCH 060/363] =?UTF-8?q?feat:=20=E6=B7=BB=E5=8A=A0=E5=B0=86?= =?UTF-8?q?=E7=88=AC=E5=8F=96=E7=BB=93=E6=9E=9C=E5=86=99=E5=85=A5MongoDB?= =?UTF-8?q?=E5=8A=9F=E8=83=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- weiboSpider.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/weiboSpider.py b/weiboSpider.py index 210545c4..18773993 100644 --- a/weiboSpider.py +++ b/weiboSpider.py @@ -14,6 +14,7 @@ import requests from lxml import etree +# from pymongo import MongoClient from requests.adapters import HTTPAdapter from tqdm import tqdm @@ -606,10 +607,22 @@ def write_txt(self, wrote_num): print('Error: ', e) traceback.print_exc() + # def write_mongodb(self, wrote_num): + # """将爬取的信息写入MongoDB数据库""" + # # 如果想使用此功能,请先确保已安装pymongo + # # 若未安装请运行pip install pymongo + # client = 
MongoClient() + # db = client['weibo'] + # collection = db['weibo'] + # for w in self.weibo[wrote_num:]: + # if not collection.find_one({'id': w['id']}): + # collection.insert_one(w) + def write_file(self, wrote_num): """写文件""" if self.got_num > wrote_num: self.write_csv(wrote_num) + # self.write_mongodb(wrote_num) self.write_txt(wrote_num) def get_weibo_info(self): From d361716180ebb6440493d6cfcd1e1b268b985a0a Mon Sep 17 00:00:00 2001 From: chenlei Date: Sun, 1 Sep 2019 22:54:20 +0800 Subject: [PATCH 061/363] =?UTF-8?q?feat:=20=E6=B7=BB=E5=8A=A0=E5=B0=86?= =?UTF-8?q?=E7=88=AC=E5=8F=96=E4=BF=A1=E6=81=AF=E5=86=99=E5=85=A5MySQL?= =?UTF-8?q?=E6=95=B0=E6=8D=AE=E5=BA=93=E5=8A=9F=E8=83=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- weiboSpider.py | 61 +++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 60 insertions(+), 1 deletion(-) diff --git a/weiboSpider.py b/weiboSpider.py index 18773993..04780a19 100644 --- a/weiboSpider.py +++ b/weiboSpider.py @@ -14,7 +14,6 @@ import requests from lxml import etree -# from pymongo import MongoClient from requests.adapters import HTTPAdapter from tqdm import tqdm @@ -611,18 +610,78 @@ def write_txt(self, wrote_num): # """将爬取的信息写入MongoDB数据库""" # # 如果想使用此功能,请先确保已安装pymongo # # 若未安装请运行pip install pymongo + # from pymongo import MongoClient + # client = MongoClient() # db = client['weibo'] # collection = db['weibo'] # for w in self.weibo[wrote_num:]: # if not collection.find_one({'id': w['id']}): # collection.insert_one(w) + # print(u'%d条微博写入MongoDB数据库文件完毕' % self.got_num) + + # def write_mysql(self, wrote_num): + # """将爬取的信息写入MySQL数据库""" + # # 如果想使用此功能,请先确保已安装pymysql + # # 若未安装请运行pip install pymysql + # import pymysql + + # db = pymysql.connect(host='localhost', + # user='root', + # password='123456', + # port=3306) + # cursor = db.cursor() + # cursor.execute( + # 'CREATE DATABASE IF NOT EXISTS weibo DEFAULT CHARACTER SET utf8mb4' + # ) + # db.close() + # db1 = 
pymysql.connect(host='localhost', + # user='root', + # password='123456', + # port=3306, + # db='weibo') + # cursor1 = db1.cursor() + # cursor1.execute(''' + # CREATE TABLE IF NOT EXISTS weibo ( + # id varchar(10) NOT NULL, + # content varchar(2000), + # original_pictures varchar(1000), + # retweet_pictures varchar(1000), + # original BOOLEAN NOT NULL DEFAULT 1, + # video_url varchar(300), + # publish_place varchar(100), + # publish_time DATETIME NOT NULL, + # publish_tool varchar(30), + # up_num INT NOT NULL, + # retweet_num INT NOT NULL, + # comment_num INT NOT NULL, + # PRIMARY KEY (id) + # ) + # ''') + # for w in self.weibo[wrote_num:]: + # table = 'weibo' + # keys = ', '.join(w.keys()) + # values = ', '.join(['%s'] * len(w)) + # sql = 'INSERT INTO {table}({keys}) VALUES ({values}) ON DUPLICATE KEY UPDATE'.format( + # table=table, keys=keys, values=values) + # update = ','.join([" {key} = %s".format(key=key) for key in w]) + # sql += update + # try: + # cursor1.execute(sql, tuple(w.values()) * 2) + # db1.commit() + # except Exception as e: + # db1.rollback() + # print('Error: ', e) + # traceback.print_exc() + # db1.close() + # print(u'%d条微博写入MySQL数据库文件完毕' % self.got_num) def write_file(self, wrote_num): """写文件""" if self.got_num > wrote_num: self.write_csv(wrote_num) # self.write_mongodb(wrote_num) + # self.write_mysql(wrote_num) self.write_txt(wrote_num) def get_weibo_info(self): From ef0b7e25ccaff5a9e92af1cf65ef4da724c3a6da Mon Sep 17 00:00:00 2001 From: chenlei Date: Mon, 2 Sep 2019 18:52:28 +0800 Subject: [PATCH 062/363] =?UTF-8?q?feat:=20=E4=B8=BA=E5=86=99=E5=85=A5Mong?= =?UTF-8?q?oDB=E6=95=B0=E6=8D=AE=E5=BA=93=E6=B7=BB=E5=8A=A0=E6=95=B0?= =?UTF-8?q?=E6=8D=AE=E6=9B=B4=E6=96=B0=E6=94=AF=E6=8C=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- weiboSpider.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/weiboSpider.py b/weiboSpider.py index 04780a19..3840900c 100644 --- 
a/weiboSpider.py +++ b/weiboSpider.py @@ -618,6 +618,8 @@ def write_txt(self, wrote_num): # for w in self.weibo[wrote_num:]: # if not collection.find_one({'id': w['id']}): # collection.insert_one(w) + # else: + # collection.update_one({'id': w['id']}, {'$set': w}) # print(u'%d条微博写入MongoDB数据库文件完毕' % self.got_num) # def write_mysql(self, wrote_num): @@ -662,8 +664,10 @@ def write_txt(self, wrote_num): # table = 'weibo' # keys = ', '.join(w.keys()) # values = ', '.join(['%s'] * len(w)) - # sql = 'INSERT INTO {table}({keys}) VALUES ({values}) ON DUPLICATE KEY UPDATE'.format( - # table=table, keys=keys, values=values) + # sql = '''INSERT INTO {table}({keys}) VALUES ({values}) + # ON DUPLICATE KEY UPDATE'''.format(table=table, + # keys=keys, + # values=values) # update = ','.join([" {key} = %s".format(key=key) for key in w]) # sql += update # try: @@ -680,8 +684,8 @@ def write_file(self, wrote_num): """写文件""" if self.got_num > wrote_num: self.write_csv(wrote_num) - # self.write_mongodb(wrote_num) # self.write_mysql(wrote_num) + # self.write_mongodb(wrote_num) self.write_txt(wrote_num) def get_weibo_info(self): From 8943b28ccafc2ee520192cf10d2b5c1f599e2242 Mon Sep 17 00:00:00 2001 From: chenlei Date: Mon, 2 Sep 2019 20:42:33 +0800 Subject: [PATCH 063/363] =?UTF-8?q?perf:=20=E4=BC=98=E5=8C=96=E6=95=B0?= =?UTF-8?q?=E6=8D=AE=E5=BA=93=E5=86=99=E5=85=A5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- weiboSpider.py | 185 ++++++++++++++++++++++++++----------------------- 1 file changed, 98 insertions(+), 87 deletions(-) diff --git a/weiboSpider.py b/weiboSpider.py index 3840900c..34e8be7d 100644 --- a/weiboSpider.py +++ b/weiboSpider.py @@ -24,6 +24,8 @@ class Weibo(object): def __init__(self, filter=0, since_date='1900-01-01', + mongodb_write=0, + mysql_write=0, pic_download=0, video_download=0): """Weibo类初始化""" @@ -31,6 +33,10 @@ def __init__(self, sys.exit(u'filter值应为0或1,请重新输入') if not self.is_date(since_date): 
sys.exit(u'since_date值应为yyyy-mm-dd形式,请重新输入') + if mongodb_write != 0 and mongodb_write != 1: + sys.exit(u'mongodb_write值应为0或1,请重新输入') + if mysql_write != 0 and mysql_write != 1: + sys.exit(u'mysql_write值应为0或1,请重新输入') if pic_download != 0 and pic_download != 1: sys.exit(u'pic_download值应为0或1,请重新输入') if video_download != 0 and video_download != 1: @@ -38,6 +44,8 @@ def __init__(self, self.user_id = '' # 用户id,如昵称为"Dear-迪丽热巴"的id为'1669879400' self.filter = filter # 取值范围为0、1,程序默认值为0,代表要爬取用户的全部微博,1代表只爬取用户的原创微博 self.since_date = since_date # 起始时间,即爬取发布日期从该值到现在的微博,形式为yyyy-mm-dd + self.mongodb_write = mongodb_write # 值为0代表不将结果写入MongoDB数据库,1代表写入 + self.mysql_write = mysql_write # 值为0代表不将结果写入MySQL数据库,1代表写入 self.pic_download = pic_download # 取值范围为0、1,程序默认值为0,代表不下载微博原始图片,1代表下载 self.video_download = video_download # 取值范围为0、1,程序默认为0,代表不下载微博视频,1代表下载 self.nickname = '' # 用户昵称,如“Dear-迪丽热巴” @@ -606,87 +614,85 @@ def write_txt(self, wrote_num): print('Error: ', e) traceback.print_exc() - # def write_mongodb(self, wrote_num): - # """将爬取的信息写入MongoDB数据库""" - # # 如果想使用此功能,请先确保已安装pymongo - # # 若未安装请运行pip install pymongo - # from pymongo import MongoClient - - # client = MongoClient() - # db = client['weibo'] - # collection = db['weibo'] - # for w in self.weibo[wrote_num:]: - # if not collection.find_one({'id': w['id']}): - # collection.insert_one(w) - # else: - # collection.update_one({'id': w['id']}, {'$set': w}) - # print(u'%d条微博写入MongoDB数据库文件完毕' % self.got_num) - - # def write_mysql(self, wrote_num): - # """将爬取的信息写入MySQL数据库""" - # # 如果想使用此功能,请先确保已安装pymysql - # # 若未安装请运行pip install pymysql - # import pymysql - - # db = pymysql.connect(host='localhost', - # user='root', - # password='123456', - # port=3306) - # cursor = db.cursor() - # cursor.execute( - # 'CREATE DATABASE IF NOT EXISTS weibo DEFAULT CHARACTER SET utf8mb4' - # ) - # db.close() - # db1 = pymysql.connect(host='localhost', - # user='root', - # password='123456', - # port=3306, - # db='weibo') - # cursor1 = db1.cursor() - # 
cursor1.execute(''' - # CREATE TABLE IF NOT EXISTS weibo ( - # id varchar(10) NOT NULL, - # content varchar(2000), - # original_pictures varchar(1000), - # retweet_pictures varchar(1000), - # original BOOLEAN NOT NULL DEFAULT 1, - # video_url varchar(300), - # publish_place varchar(100), - # publish_time DATETIME NOT NULL, - # publish_tool varchar(30), - # up_num INT NOT NULL, - # retweet_num INT NOT NULL, - # comment_num INT NOT NULL, - # PRIMARY KEY (id) - # ) - # ''') - # for w in self.weibo[wrote_num:]: - # table = 'weibo' - # keys = ', '.join(w.keys()) - # values = ', '.join(['%s'] * len(w)) - # sql = '''INSERT INTO {table}({keys}) VALUES ({values}) - # ON DUPLICATE KEY UPDATE'''.format(table=table, - # keys=keys, - # values=values) - # update = ','.join([" {key} = %s".format(key=key) for key in w]) - # sql += update - # try: - # cursor1.execute(sql, tuple(w.values()) * 2) - # db1.commit() - # except Exception as e: - # db1.rollback() - # print('Error: ', e) - # traceback.print_exc() - # db1.close() - # print(u'%d条微博写入MySQL数据库文件完毕' % self.got_num) - - def write_file(self, wrote_num): - """写文件""" + def write_mongodb(self, wrote_num): + """将爬取的信息写入MongoDB数据库""" + from pymongo import MongoClient + + client = MongoClient() + db = client['weibo'] + collection = db['weibo'] + for w in self.weibo[wrote_num:]: + if not collection.find_one({'id': w['id']}): + collection.insert_one(w) + else: + collection.update_one({'id': w['id']}, {'$set': w}) + print(u'%d条微博写入MongoDB数据库文件完毕' % self.got_num) + + def write_mysql(self, wrote_num): + """将爬取的信息写入MySQL数据库""" + import pymysql + + db = pymysql.connect(host='localhost', + user='root', + password='123456', + port=3306) + cursor = db.cursor() + cursor.execute( + 'CREATE DATABASE IF NOT EXISTS weibo DEFAULT CHARACTER SET utf8mb4' + ) + db.close() + db1 = pymysql.connect(host='localhost', + user='root', + password='123456', + port=3306, + db='weibo') + cursor1 = db1.cursor() + cursor1.execute(""" + CREATE TABLE IF NOT EXISTS 
weibo ( + id varchar(10) NOT NULL, + content varchar(2000), + original_pictures varchar(1000), + retweet_pictures varchar(1000), + original BOOLEAN NOT NULL DEFAULT 1, + video_url varchar(300), + publish_place varchar(100), + publish_time DATETIME NOT NULL, + publish_tool varchar(30), + up_num INT NOT NULL, + retweet_num INT NOT NULL, + comment_num INT NOT NULL, + PRIMARY KEY (id) + ) + """) + for w in self.weibo[wrote_num:]: + table = 'weibo' + keys = ', '.join(w.keys()) + values = ', '.join(['%s'] * len(w)) + sql = """INSERT INTO {table}({keys}) VALUES ({values}) + ON DUPLICATE KEY UPDATE""".format(table=table, + keys=keys, + values=values) + update = ','.join([" {key} = %s".format(key=key) for key in w]) + sql += update + try: + cursor1.execute(sql, tuple(w.values()) * 2) + db1.commit() + except Exception as e: + db1.rollback() + print('Error: ', e) + traceback.print_exc() + db1.close() + print(u'%d条微博写入MySQL数据库文件完毕' % self.got_num) + + def write_data(self, wrote_num): + """将爬取到的信息写入文件或数据库""" if self.got_num > wrote_num: self.write_csv(wrote_num) - # self.write_mysql(wrote_num) - # self.write_mongodb(wrote_num) self.write_txt(wrote_num) + if self.mysql_write: + self.write_mysql(wrote_num) + if self.mongodb_write: + self.write_mongodb(wrote_num) def get_weibo_info(self): """获取微博信息""" @@ -704,7 +710,7 @@ def get_weibo_info(self): break if page % 20 == 0: # 每爬20页写入一次文件 - self.write_file(wrote_num) + self.write_data(wrote_num) wrote_num = self.got_num # 通过加入随机等待避免被限制。爬虫速度过快容易被系统限制(一段时间后限 @@ -715,7 +721,7 @@ def get_weibo_info(self): page1 = page random_pages = random.randint(1, 5) - self.write_file(wrote_num) # 将剩余不足20页的微博写入文件 + self.write_data(wrote_num) # 将剩余不足20页的微博写入文件 if not self.filter: print(u'共爬取' + str(self.got_num) + u'条微博') else: @@ -762,20 +768,25 @@ def main(): # 以下是程序配置信息,可以根据自己需求修改 filter = 1 # 值为0表示爬取全部微博(原创微博+转发微博),值为1表示只爬取原创微博 since_date = '2018-01-01' # 起始时间,即爬取发布日期从该值到现在的微博,形式为yyyy-mm-dd + """值为0代表不将结果写入MongoDB数据库,1代表写入;若要写入MongoDB数据库, + 
请先安装MongoDB数据库和pymongo,pymongo安装方法为命令行运行:pip install pymongo""" + mongodb_write = 0 + """值为0代表不将结果写入MySQL数据库,1代表写入;若要写入MySQL数据库, + 请先安装MySQL数据库和pymysql,pymysql安装方法为命令行运行:pip install pymysql""" + mysql_write = 0 pic_download = 1 # 值为0代表不下载微博原始图片,1代表下载微博原始图片 video_download = 1 # 值为0代表不下载微博视频,1代表下载微博视频 - wb = Weibo(filter, since_date, pic_download, video_download) - - # user_id_list包含了要爬的目标微博id,可以是一个,也可以是多个,也可以从文件中读取 - # 爬单个微博,user_id_list如下所示,可以改成任意合法的用户id + wb = Weibo(filter, since_date, mongodb_write, mysql_write, + pic_download, video_download) + """user_id_list包含了要爬的目标微博id,可以是一个,也可以是多个,也可以从文件中读取 + 爬单个微博,user_id_list如下所示,可以改成任意合法的用户id""" user_id_list = ['1669879400'] # 爬多个微博,user_id_list如下所示,可以改成任意合法的用户id # user_id_list = ['1669879400', '1729370543'] - - # 也可以在文件中读取,文件中可以包含很多user_id,每个user_id占一行,文件名任意,类型为txt,位置位于本程序的同目录下, - # 比如文件可以叫user_id_list.txt,读取文件中的user_id_list如下所示: + """也可以在文件中读取user_id_list,文件中可以包含很多user_id,每个user_id占一行,文件名任意,类型为txt,位置位于本程序的同目录下, + 比如文件可以叫user_id_list.txt,读取文件中的user_id_list如下所示:""" # user_id_list = wb.get_user_list('user_id_list.txt') wb.start(user_id_list) # 爬取微博信息 From 6dd6a58abff1171ed75120c1e38e7dd35145f7a1 Mon Sep 17 00:00:00 2001 From: chenlei Date: Sun, 8 Sep 2019 19:28:03 +0800 Subject: [PATCH 064/363] =?UTF-8?q?perf:=20=E4=BC=98=E5=8C=96=E7=88=AC?= =?UTF-8?q?=E5=8F=96=E4=BF=A1=E6=81=AF=E5=86=99=E5=85=A5MySQL=E6=95=B0?= =?UTF-8?q?=E6=8D=AE=E5=BA=93=E8=BF=87=E7=A8=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- weiboSpider.py | 154 ++++++++++++++++++++++++++++++++++--------------- 1 file changed, 107 insertions(+), 47 deletions(-) diff --git a/weiboSpider.py b/weiboSpider.py index 34e8be7d..7a672ac7 100644 --- a/weiboSpider.py +++ b/weiboSpider.py @@ -54,6 +54,8 @@ def __init__(self, self.following = 0 # 用户关注数 self.followers = 0 # 用户粉丝数 self.weibo = [] # 存储爬取到的所有微博信息 + self.mysql_config = { + } # MySQL数据库连接配置,可以不填,当使用者的mysql用户名、密码等与本程序默认值不同时,需要通过mysql_config来自定义 def is_date(self, 
since_date): """判断日期格式是否正确""" @@ -626,28 +628,86 @@ def write_mongodb(self, wrote_num): collection.insert_one(w) else: collection.update_one({'id': w['id']}, {'$set': w}) - print(u'%d条微博写入MongoDB数据库文件完毕' % self.got_num) + print(u'%d条微博写入MongoDB数据库完毕' % self.got_num) - def write_mysql(self, wrote_num): - """将爬取的信息写入MySQL数据库""" + def change_mysql_config(self, mysql_config): + """修改MySQL数据库连接配置""" + self.mysql_config = mysql_config + + def mysql_create(self, connection, sql): + """创建MySQL数据库或表""" + try: + with connection.cursor() as cursor: + cursor.execute(sql) + finally: + connection.close() + + def mysql_create_database(self, mysql_config, sql): + """创建MySQL数据库""" + import pymysql + + if self.mysql_config: + mysql_config = self.mysql_config + connection = pymysql.connect(**mysql_config) + self.mysql_create(connection, sql) + + def mysql_create_table(self, mysql_config, sql): + """创建MySQL表""" + import pymysql + + if self.mysql_config: + mysql_config = self.mysql_config + mysql_config['db'] = 'weibo' + connection = pymysql.connect(**mysql_config) + self.mysql_create(connection, sql) + + def mysql_insert(self, mysql_config, table, data_list): + """向MySQL表插入或更新数据""" import pymysql - db = pymysql.connect(host='localhost', - user='root', - password='123456', - port=3306) - cursor = db.cursor() - cursor.execute( - 'CREATE DATABASE IF NOT EXISTS weibo DEFAULT CHARACTER SET utf8mb4' - ) - db.close() - db1 = pymysql.connect(host='localhost', - user='root', - password='123456', - port=3306, - db='weibo') - cursor1 = db1.cursor() - cursor1.execute(""" + if len(data_list) > 0: + keys = ', '.join(data_list[0].keys()) + values = ', '.join(['%s'] * len(data_list[0])) + if self.mysql_config: + mysql_config = self.mysql_config + mysql_config['db'] = 'weibo' + connection = pymysql.connect(**mysql_config) + cursor = connection.cursor() + sql = """INSERT INTO {table}({keys}) VALUES ({values}) ON + DUPLICATE KEY UPDATE""".format(table=table, + keys=keys, + values=values) + update = 
','.join([ + " {key} = values({key})".format(key=key) + for key in data_list[0] + ]) + sql += update + try: + cursor.executemany( + sql, [tuple(data.values()) for data in data_list]) + connection.commit() + except Exception as e: + connection.rollback() + print('Error: ', e) + traceback.print_exc() + finally: + connection.close() + + def write_mysql(self, wrote_num): + """将爬取的信息写入MySQL数据库""" + mysql_config = { + 'host': 'localhost', + 'port': 3306, + 'user': 'root', + 'password': '123456', + 'charset': 'utf8mb4' + } + # 创建'weibo'数据库 + create_database = """CREATE DATABASE IF NOT EXISTS weibo DEFAULT + CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci""" + self.mysql_create_database(mysql_config, create_database) + # 创建'weibo'表 + create_table = """ CREATE TABLE IF NOT EXISTS weibo ( id varchar(10) NOT NULL, content varchar(2000), @@ -662,27 +722,11 @@ def write_mysql(self, wrote_num): retweet_num INT NOT NULL, comment_num INT NOT NULL, PRIMARY KEY (id) - ) - """) - for w in self.weibo[wrote_num:]: - table = 'weibo' - keys = ', '.join(w.keys()) - values = ', '.join(['%s'] * len(w)) - sql = """INSERT INTO {table}({keys}) VALUES ({values}) - ON DUPLICATE KEY UPDATE""".format(table=table, - keys=keys, - values=values) - update = ','.join([" {key} = %s".format(key=key) for key in w]) - sql += update - try: - cursor1.execute(sql, tuple(w.values()) * 2) - db1.commit() - except Exception as e: - db1.rollback() - print('Error: ', e) - traceback.print_exc() - db1.close() - print(u'%d条微博写入MySQL数据库文件完毕' % self.got_num) + ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4""" + self.mysql_create_table(mysql_config, create_table) + # 在'weibo'表中插入或更新微博数据 + self.mysql_insert(mysql_config, 'weibo', self.weibo[wrote_num:]) + print(u'%d条微博写入MySQL数据库完毕' % self.got_num) def write_data(self, wrote_num): """将爬取到的信息写入文件或数据库""" @@ -779,15 +823,31 @@ def main(): wb = Weibo(filter, since_date, mongodb_write, mysql_write, pic_download, video_download) + + # 下面是自定义MySQL数据库连接配置(可选) + 
"""因为操作MySQL数据库需要用户名、密码等参数,本程序默认为: + mysql_config = { + 'host': 'localhost', + 'port': 3306, + 'user': 'root', + 'password': '123456', + 'charset': 'utf8mb4' + } + 大家的参数配置如果和默认值不同,可以将上面的参数值替换成自己的, + 然后添加如下代码,使修改生效,如果你的参数和默认值相同则不需要下面的代码: + wb.change_mysql_config(mysql_config)""" + + # 下面是配置user_id_list """user_id_list包含了要爬的目标微博id,可以是一个,也可以是多个,也可以从文件中读取 - 爬单个微博,user_id_list如下所示,可以改成任意合法的用户id""" + 爬单个微博,user_id_list如下所示,可以改成任意合法的用户id + user_id_list = ['1669879400'] + 爬多个微博,user_id_list如下所示,可以改成任意合法的用户id + user_id_list = ['1669879400', '1729370543'] + 也可以在文件中读取user_id_list,文件中可以包含很多user_id, + 每个user_id占一行,文件名任意,类型为txt,位置位于本程序的同目录下, + 比如文件可以叫user_id_list.txt,读取文件中的user_id_list如下所示: + user_id_list = wb.get_user_list('user_id_list.txt')""" user_id_list = ['1669879400'] - - # 爬多个微博,user_id_list如下所示,可以改成任意合法的用户id - # user_id_list = ['1669879400', '1729370543'] - """也可以在文件中读取user_id_list,文件中可以包含很多user_id,每个user_id占一行,文件名任意,类型为txt,位置位于本程序的同目录下, - 比如文件可以叫user_id_list.txt,读取文件中的user_id_list如下所示:""" - # user_id_list = wb.get_user_list('user_id_list.txt') wb.start(user_id_list) # 爬取微博信息 except Exception as e: From 6658da37d246e1d508d348941b4dab96b41c03f8 Mon Sep 17 00:00:00 2001 From: chenlei Date: Mon, 9 Sep 2019 22:07:28 +0800 Subject: [PATCH 065/363] =?UTF-8?q?fix:=20=E4=BF=AE=E5=A4=8D=E5=9B=A0?= =?UTF-8?q?=E7=BD=AE=E9=A1=B6=E5=BE=AE=E5=8D=9A=E5=AF=BC=E8=87=B4=E9=83=A8?= =?UTF-8?q?=E5=88=86=E5=BE=AE=E5=8D=9A=E6=97=A0=E6=B3=95=E6=8C=89=E6=97=B6?= =?UTF-8?q?=E9=97=B4=E7=AD=9B=E9=80=89=E7=9A=84=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- weiboSpider.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/weiboSpider.py b/weiboSpider.py index 7a672ac7..fcd85824 100644 --- a/weiboSpider.py +++ b/weiboSpider.py @@ -503,6 +503,14 @@ def print_one_weibo(self, weibo): print(u'转发数:%d' % weibo['retweet_num']) print(u'评论数:%d' % weibo['comment_num']) + def is_pinned_weibo(self, info): + 
"""判断微博是否为置顶微博""" + kt = info.xpath(".//span[@class='kt']/text()") + if kt and kt[0] == u'置顶': + return True + else: + return False + def get_one_page(self, page): """获取第page页的全部微博""" try: @@ -514,8 +522,15 @@ def get_one_page(self, page): for i in range(0, len(info) - 2): weibo = self.get_one_weibo(info[i]) if weibo: - if weibo['publish_time'] < self.since_date: - return True + publish_time = datetime.strptime( + weibo['publish_time'][:10], "%Y-%m-%d") + since_date = datetime.strptime(self.since_date, + "%Y-%m-%d") + if publish_time < since_date: + if self.is_pinned_weibo(info[i]): + continue + else: + return True self.print_one_weibo(weibo) self.weibo.append(weibo) self.got_num += 1 From 0d1f9782ee7c6914aa2845fac0f7a4485d6f2cce Mon Sep 17 00:00:00 2001 From: chenlei Date: Mon, 9 Sep 2019 23:49:34 +0800 Subject: [PATCH 066/363] =?UTF-8?q?feat:=20=E4=B8=BA=E6=95=B0=E6=8D=AE?= =?UTF-8?q?=E5=BA=93'weibo'=E8=A1=A8=E6=B7=BB=E5=8A=A0user=5Fid=E5=AD=97?= =?UTF-8?q?=E6=AE=B5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- weiboSpider.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/weiboSpider.py b/weiboSpider.py index fcd85824..5ceb1635 100644 --- a/weiboSpider.py +++ b/weiboSpider.py @@ -639,6 +639,7 @@ def write_mongodb(self, wrote_num): db = client['weibo'] collection = db['weibo'] for w in self.weibo[wrote_num:]: + w['user_id'] = self.user_id if not collection.find_one({'id': w['id']}): collection.insert_one(w) else: @@ -725,6 +726,7 @@ def write_mysql(self, wrote_num): create_table = """ CREATE TABLE IF NOT EXISTS weibo ( id varchar(10) NOT NULL, + user_id varchar(12), content varchar(2000), original_pictures varchar(1000), retweet_pictures varchar(1000), @@ -740,7 +742,11 @@ def write_mysql(self, wrote_num): ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4""" self.mysql_create_table(mysql_config, create_table) # 在'weibo'表中插入或更新微博数据 - self.mysql_insert(mysql_config, 'weibo', self.weibo[wrote_num:]) + 
weibo_list = [] + for weibo in self.weibo[wrote_num:]: + weibo['user_id'] = self.user_id + weibo_list.append(weibo) + self.mysql_insert(mysql_config, 'weibo', weibo_list) print(u'%d条微博写入MySQL数据库完毕' % self.got_num) def write_data(self, wrote_num): From 1ffccf03b9f169c03bac8334886ae477db1bce06 Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Wed, 11 Sep 2019 03:49:12 +0800 Subject: [PATCH 067/363] Update README.md --- README.md | 152 +++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 105 insertions(+), 47 deletions(-) diff --git a/README.md b/README.md index 80712bb4..2808b846 100644 --- a/README.md +++ b/README.md @@ -1,47 +1,35 @@ * [功能](#功能) -* [输入](#输入) * [输出](#输出) +* [实例](#实例) * [运行环境](#运行环境) * [使用说明](#使用说明) * [下载脚本](#1下载脚本) * [安装依赖](#2安装依赖) - * [设置cookie和user_id](#3设置cookie和user_id) - * [运行脚本](#4运行脚本) - * [按需求修改脚本](#5按需求修改脚本可选) + * [设置cookie](#3设置cookie) + * [设置user_id](#4设置user_id) + * [运行脚本](#5运行脚本) + * [按需求修改脚本](#6按需求修改脚本可选) * [如何获取cookie](#如何获取cookie) * [如何获取user_id](#如何获取user_id) * [注意事项](#注意事项) -# 功能 -爬取新浪微博信息,并写入csv/txt文件,文件名为目标用户id加".csv"和".txt"的形式,同时还会下载该微博原始图片和微博视频(可选)。
-
-本程序需要设置用户cookie,以获取微博访问权限,后面会讲解如何获取cookie。如需免cookie版,大家可以访问, -二者功能类似,免cookie版因为不需要cookie,用法更简单,但功能却更多。
-
-以爬取迪丽热巴的微博为例,她的微博昵称为"Dear-迪丽热巴",id为1669879400(后面会讲如何获取用户id)。我们选择爬取她的原创微博。程序会自动生成一个weibo文件夹,我们以后爬取的所有微博都被存储在这里。然后程序在该文件夹下生成一个名为"Dear-迪丽热巴"的文件夹,迪丽热巴的所有微博爬取结果都在这里。"Dear-迪丽热巴"文件夹里包含一个csv文件、一个txt文件、一个img文件夹和一个video文件夹,img文件夹用来存储下载到的图片,video文件夹用来存储下载到的视频。
-
-csv文件结果如下所示: -![](https://picture.cognize.me/cognize/github/weibospider/weibo_csv.png)*1669879400.csv*
-
-txt文件结果如下所示: -![](https://picture.cognize.me/cognize/github/weibospider/weibo_txt.png)*1669879400.txt*
-
-下载的图片如下所示: -![](https://picture.cognize.me/cognize/github/weibospider/img.png)*img文件夹*
-本次下载了793张图片,大小一共1.21GB,包括她原创微博中的图片和转发微博转发理由中的图片。图片名为yyyymmdd+微博id的形式,若某条微博存在多张图片,则图片名中还会包括它在微博图片中的序号。若某张图片因为网络等原因下载失败,程序则会以“weibo_id:pic_url”的形式将出错微博id和图片url写入同文件夹下的not_downloaded.txt里;
-
-下载的视频如下所示: -![](https://picture.cognize.me/cognize/github/weibospider/video.png)*video文件夹*
-本次下载了70个视频,是她原创微博中的视频,视频名为yyyymmdd+微博id的形式。其中有一个视频因为网络原因下载失败,程序将它的微博id和视频url以“weibo_id:video_url”的形式写到了同文件夹下的not_downloaded.txt里。 - -# 输入 -用户id,例如新浪微博昵称为"Dear-迪丽热巴"的id为"1669879400" - -# 输出 +## 功能 +爬取**一个**或**多个**新浪微博用户的数据,并将结果信息写入文件。写入信息几乎包括了用户微博的所有数据,主要包含用户信息和微博信息两类,前者包含用户昵称、关注数、粉丝数、微博数等等;后者包含微博正文、发布时间、发布工具、评论数等等,因为内容太多,这里不再赘述,详细内容见[输出](#输出)部分。具体的写入文件类型如下: +- 写入**txt文件**(默认) +- 写入**csv文件**(默认) +- 写入**MySQL数据库**(可选) +- 写入**MongoDB数据库**(可选) +- 下载用户微博中的原始**图片**(默认) +- 下载用户微博中的**视频**(默认) +本程序需要设置用户cookie,以获取微博访问权限,后面会讲解如何获取cookie。如需免cookie版,大家可以访问,二者功能类似,免cookie版获取的信息更多,用法更简单,而且不需要cookie。
+## 输出 +**用户信息** - 昵称:用户昵称,如"Dear-迪丽热巴" - 微博数:用户的全部微博数(转发微博+原创微博) - 关注数:用户关注的微博数量 - 粉丝数:用户的粉丝数 +*** +**微博信息** - 微博id:微博唯一标志 - 微博内容:微博正文 - 原始图片url:原创微博图片和转发微博转发理由中图片的url,若某条微博存在多张图片,每个url以英文逗号分隔,若没有图片则值为无 @@ -54,40 +42,110 @@ txt文件结果如下所示: - 结果文件:保存在当前目录weibo文件夹下以用户昵称为名的文件夹里,名字为"user_id.csv"和"user_id.txt"的形式 - 微博图片:原创微博中的图片和转发微博转发理由中的图片,保存在以用户昵称为名的文件夹下的img文件夹里 - 微博视频:原创微博中的视频,保存在以用户昵称为名的文件夹下的video文件夹里 +
+## 实例 +以爬取迪丽热巴的微博为例,她的微博昵称为"Dear-迪丽热巴",id为1669879400(后面会讲如何获取用户id)。我们选择爬取她的原创微博。程序会自动生成一个weibo文件夹,我们以后爬取的所有微博都被存储在这里。然后程序在该文件夹下生成一个名为"Dear-迪丽热巴"的文件夹,迪丽热巴的所有微博爬取结果都在这里。"Dear-迪丽热巴"文件夹里包含一个csv文件、一个txt文件、一个img文件夹和一个video文件夹,img文件夹用来存储下载到的图片,video文件夹用来存储下载到的视频。如果你设置了保存数据库功能,这些信息也会保存在数据库里,数据库设置见数据库部分。
+
+csv文件结果如下所示: +![](https://picture.cognize.me/cognize/github/weibospider/weibo_csv.png)*1669879400.csv*
+
+txt文件结果如下所示: +![](https://picture.cognize.me/cognize/github/weibospider/weibo_txt.png)*1669879400.txt*
+
+下载的图片如下所示: +![](https://picture.cognize.me/cognize/github/weibospider/img.png)*img文件夹*
+本次下载了793张图片,大小一共1.21GB,包括她原创微博中的图片和转发微博转发理由中的图片。图片名为yyyymmdd+微博id的形式,若某条微博存在多张图片,则图片名中还会包括它在微博图片中的序号。若某张图片因为网络等原因下载失败,程序则会以“weibo_id:pic_url”的形式将出错微博id和图片url写入同文件夹下的not_downloaded.txt里;
+
+下载的视频如下所示: +![](https://picture.cognize.me/cognize/github/weibospider/video.png)*video文件夹*
+本次下载了70个视频,是她原创微博中的视频,视频名为yyyymmdd+微博id的形式。其中有一个视频因为网络原因下载失败,程序将它的微博id和视频url以“weibo_id:video_url”的形式写到了同文件夹下的not_downloaded.txt里。 -# 运行环境 +## 运行环境 - 开发语言:python2/python3 - 系统: Windows/Linux/macOS -# 使用说明 -## 1.下载脚本 +## 使用说明 +### 1.下载脚本 ```bash $ git clone https://github.com/dataabc/weibospider.git ``` 运行上述命令,将本项目下载到当前目录,如果下载成功当前目录会出现一个名为"weibospider"的文件夹; -## 2.安装依赖 +### 2.安装依赖 ```bash pip install -r requirements.txt ``` -## 3.设置cookie和user_id -打开weibospider文件夹下的"**weibospider.py**"文件,将"**your cookie**"替换成爬虫微博的cookie,后面会详细讲解如何获取cookie;将**user_id**替换成想要爬取的微博的user_id,后面会详细讲解如何获取user_id; -## 4.运行脚本 +### 3.设置cookie +打开weibospider文件夹下的"**weibospider.py**"文件,将"**your cookie**"替换成爬虫微博的cookie,cookie获取方法见[如何获取cookie](#如何获取cookie); +### 4.设置user_id +打开weibospider文件夹下的"**weibospider.py**"文件,将我们想要爬取的**一个**或**多个**微博的user_id赋值给user_id_list,user_id获取方法见[如何获取user_id](#如何获取user_id); +user_id设置代码位于**weibospider.py**的main函数里,具体代码如下: +```python +# 爬单个微博用户 +user_id_list = ['1669879400'] +``` +```python +# 爬多个微博用户 +user_id_list = ['1669879400', '1729370543'] +``` +```python +"""也可以在文件中读取user_id_list,文件中可以包含很多user_id, +每个user_id占一行,文件名任意,类型为txt,位置位于本程序的同目录下, +比如文件可以叫user_id_list.txt""" +user_id_list = wb.get_user_list('user_id_list.txt') +``` +### 5.运行脚本 大家可以根据自己的运行环境选择运行方式,Linux可以通过 ```bash $ python weibospider.py ``` 运行; -## 5.按需求修改脚本(可选) +### 6.按需求修改脚本(可选) 本脚本是一个Weibo类,用户可以按照自己的需求调用Weibo类。 例如用户可以直接在"weibospider.py"文件中调用Weibo类,具体调用代码示例如下: ```python -user_id = 1669879400 -filter = 1 -pic_download = 1 -wb = Weibo(user_id, filter, pic_download) #调用Weibo类,创建微博实例wb -wb.start() #爬取微博信息 + # 以下是程序配置信息,可以根据自己需求修改 + filter = 1 # 值为0表示爬取全部微博(原创微博+转发微博),值为1表示只爬取原创微博 + since_date = '2019-07-01' # 起始时间,即爬取发布日期从该值到现在的微博,形式为yyyy-mm-dd + """值为0代表不将结果写入MongoDB数据库,1代表写入;若要写入MongoDB数据库, + 请先安装MongoDB数据库和pymongo,pymongo安装方法为命令行运行:pip install pymongo""" + mongodb_write = 1 + """值为0代表不将结果写入MySQL数据库,1代表写入;若要写入MySQL数据库, + 请先安装MySQL数据库和pymysql,pymysql安装方法为命令行运行:pip install pymysql""" + mysql_write = 1 + 
pic_download = 0 # 值为0代表不下载微博原始图片,1代表下载微博原始图片 + video_download = 0 # 值为0代表不下载微博视频,1代表下载微博视频 + + wb = Weibo(filter, since_date, mongodb_write, mysql_write, + pic_download, video_download) + + # 下面是自定义MySQL数据库连接配置(可选) + """因为操作MySQL数据库需要用户名、密码等参数,本程序默认为: + mysql_config = { + 'host': 'localhost', + 'port': 3306, + 'user': 'root', + 'password': '123456', + 'charset': 'utf8mb4' + } + 大家的参数配置如果和默认值不同,可以将上面的参数值替换成自己的, + 然后添加如下代码,使修改生效,如果你的参数和默认值相同则不需要下面的代码: + wb.change_mysql_config(mysql_config)""" + + # 下面是配置user_id_list + """user_id_list包含了要爬的目标微博id,可以是一个,也可以是多个,也可以从文件中读取 + 爬单个微博,user_id_list如下所示,可以改成任意合法的用户id + user_id_list = ['1669879400'] + 爬多个微博,user_id_list如下所示,可以改成任意合法的用户id + user_id_list = ['1669879400', '1729370543'] + 也可以在文件中读取user_id_list,文件中可以包含很多user_id, + 每个user_id占一行,文件名任意,类型为txt,位置位于本程序的同目录下, + 比如文件可以叫user_id_list.txt,读取文件中的user_id_list如下所示: + user_id_list = wb.get_user_list('user_id_list.txt')""" + user_id_list = ['1669879400', '1729370543'] + + wb.start(user_id_list) # 爬取微博信息 ``` -user_id可以改成任意合法的用户id(爬虫的微博id除外);filter默认值为0,表示爬取所有微博信息(转发微博+原创微博),为1表示只爬取用户的所有原创微博;pic_download默认值为0,代表不下载微博原始图片,1代表下载;wb是Weibo类的一个实例,也可以是其它名字,只要符合python的命名规范即可;通过执行wb.start() 完成了微博的爬取工作。在上述代码执行后,我们可以得到很多信息:
+通过执行wb.start() 完成了微博的爬取工作。在上述代码执行后,我们可以得到很多信息:
**wb.nickname**:用户昵称;
**wb.weibo_num**:微博数;
**wb.following**:关注数;
@@ -111,7 +169,7 @@ user_id可以改成任意合法的用户id(爬虫的微博id除外);filter -# 如何获取cookie +## 如何获取cookie 1.用Chrome打开
2.输入微博的用户名、密码,登录,如图所示: ![](https://picture.cognize.me/cognize/github/weibospider/cookie1.png) @@ -121,7 +179,7 @@ user_id可以改成任意合法的用户id(爬虫的微博id除外);filter 4.依此点击Chrome开发者工具中的Network->Name中的weibo.cn->Headers->Request Headers,"Cookie:"后的值即为我们要找的cookie值,复制即可,如图所示: ![](https://picture.cognize.me/cognize/github/weibospider/cookie3.png) -# 如何获取user_id +## 如何获取user_id 1.打开网址,搜索我们要找的人,如"迪丽热巴",进入她的主页;
![](https://picture.cognize.me/cognize/github/weibospider/user_home.png) 2.按照上图箭头所指,点击"资料"链接,跳转到用户资料页面;
@@ -129,6 +187,6 @@ user_id可以改成任意合法的用户id(爬虫的微博id除外);filter 如上图所示,迪丽热巴微博资料页的地址为"<https://weibo.cn/1669879400/info>",其中的"1669879400"即为此微博的user_id。
事实上,此微博的user_id也包含在用户主页()中,之所以我们还要点击主页中的"资料"来获取user_id,是因为很多用户的主页不是""的形式,而是""或""的形式。其中"微号"和user_id都是一串数字,如果仅仅通过主页地址提取user_id,很容易将"微号"误认为user_id。 -# 注意事项 +## 注意事项 1.user_id不能为爬虫微博的user_id。因为要爬微博信息,必须先登录到某个微博账号,此账号我们姑且称为爬虫微博。爬虫微博访问自己的页面和访问其他用户的页面,得到的网页格式不同,所以无法爬取自己的微博信息;
2.cookie有期限限制,超过有效期需重新更新cookie。 From d9b86b7092e51b8a4018e88c7177e83bca753f10 Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Wed, 11 Sep 2019 13:12:11 +0800 Subject: [PATCH 068/363] Update README.md --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 2808b846..957ba30f 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,7 @@ * [注意事项](#注意事项) ## 功能 -爬取**一个**或**多个**新浪微博用户的数据,并将结果信息写入文件。写入信息几乎包括了用户微博的所有数据,主要包含用户信息和微博信息两类,前者包含用户昵称、关注数、粉丝数、微博数等等;后者包含微博正文、发布时间、发布工具、评论数等等,因为内容太多,这里不再赘述,详细内容见[输出](#输出)部分。具体的写入文件类型如下: +爬取**一个**或**多个**新浪微博用户的数据,并将结果信息写入文件。写入信息几乎包括了用户微博的所有数据,主要有用户信息和微博信息两大类,前者包含用户昵称、关注数、粉丝数、微博数等等;后者包含微博正文、发布时间、发布工具、评论数等等,因为内容太多,这里不再赘述,详细内容见[输出](#输出)部分。具体的写入文件类型如下: - 写入**txt文件**(默认) - 写入**csv文件**(默认) - 写入**MySQL数据库**(可选) @@ -43,6 +43,7 @@ - 微博图片:原创微博中的图片和转发微博转发理由中的图片,保存在以用户昵称为名的文件夹下的img文件夹里 - 微博视频:原创微博中的视频,保存在以用户昵称为名的文件夹下的video文件夹里
+ ## 实例 以爬取迪丽热巴的微博为例,她的微博昵称为"Dear-迪丽热巴",id为1669879400(后面会讲如何获取用户id)。我们选择爬取她的原创微博。程序会自动生成一个weibo文件夹,我们以后爬取的所有微博都被存储在这里。然后程序在该文件夹下生成一个名为"Dear-迪丽热巴"的文件夹,迪丽热巴的所有微博爬取结果都在这里。"Dear-迪丽热巴"文件夹里包含一个csv文件、一个txt文件、一个img文件夹和一个video文件夹,img文件夹用来存储下载到的图片,video文件夹用来存储下载到的视频。如果你设置了保存数据库功能,这些信息也会保存在数据库里,数据库设置见数据库部分。

From 09e55fefae636aecfb9dd208ea8dc5b9b3ae88da Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Wed, 11 Sep 2019 13:38:53 +0800 Subject: [PATCH 069/363] Update README.md --- README.md | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 957ba30f..bd197973 100644 --- a/README.md +++ b/README.md @@ -45,7 +45,32 @@
## 实例 -以爬取迪丽热巴的微博为例,她的微博昵称为"Dear-迪丽热巴",id为1669879400(后面会讲如何获取用户id)。我们选择爬取她的原创微博。程序会自动生成一个weibo文件夹,我们以后爬取的所有微博都被存储在这里。然后程序在该文件夹下生成一个名为"Dear-迪丽热巴"的文件夹,迪丽热巴的所有微博爬取结果都在这里。"Dear-迪丽热巴"文件夹里包含一个csv文件、一个txt文件、一个img文件夹和一个video文件夹,img文件夹用来存储下载到的图片,video文件夹用来存储下载到的视频。如果你设置了保存数据库功能,这些信息也会保存在数据库里,数据库设置见数据库部分。
+以爬取迪丽热巴的微博为例,她的微博昵称为"Dear-迪丽热巴",id为1669879400(用户id获取方法见[如何获取user_id](#如何获取user_id))。我们选择爬取她的全部原创微博。具体方法为将**weibospider.py**文件的main函数主要部分修改为如下代码: +```python + # 以下是程序配置信息,可以根据自己需求修改 + filter = 1 # 值为0表示爬取全部微博(原创微博+转发微博),值为1表示只爬取原创微博 + since_date = '1900-01-01' # 起始时间,即爬取发布日期从该值到现在的微博,形式为yyyy-mm-dd + """值为0代表不将结果写入MongoDB数据库,1代表写入;若要写入MongoDB数据库, + 请先安装MongoDB数据库和pymongo,pymongo安装方法为命令行运行:pip install pymongo""" + mongodb_write = 0 + """值为0代表不将结果写入MySQL数据库,1代表写入;若要写入MySQL数据库, + 请先安装MySQL数据库和pymysql,pymysql安装方法为命令行运行:pip install pymysql""" + mysql_write = 0 + pic_download = 0 # 值为0代表不下载微博原始图片,1代表下载微博原始图片 + video_download = 0 # 值为0代表不下载微博视频,1代表下载微博视频 + + wb = Weibo(filter, since_date, mongodb_write, mysql_write, + pic_download, video_download) + user_id_list = ['1669879400'] + + wb.start(user_id_list) # 爬取微博信息 +``` +具体代码含义注释里都有,因为我本地没有安装MySQL数据库和MongoDB数据库,所以暂时设置成不写入数据库,如果你想要将爬取结果写入数据库,只要先安装数据库,然后将mongodb_write或mysql_write值设置为1即可。写入MySQL需要用户名、密码等配置,这些配置如何设置见数据库部分。 +设置完成后运行程序: +```bash +$ python weibospider.py +``` +程序会自动生成一个weibo文件夹,我们以后爬取的所有微博都被存储在这里。然后程序在该文件夹下生成一个名为"Dear-迪丽热巴"的文件夹,迪丽热巴的所有微博爬取结果都在这里。"Dear-迪丽热巴"文件夹里包含一个csv文件、一个txt文件、一个img文件夹和一个video文件夹,img文件夹用来存储下载到的图片,video文件夹用来存储下载到的视频。如果你设置了保存数据库功能,这些信息也会保存在数据库里,数据库设置见数据库部分。

csv文件结果如下所示: ![](https://picture.cognize.me/cognize/github/weibospider/weibo_csv.png)*1669879400.csv*
From b01c89141cae68186ac191cb437a8177f4bb0674 Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Wed, 11 Sep 2019 19:59:51 +0800 Subject: [PATCH 070/363] Update README.md --- README.md | 87 ++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 70 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index bd197973..7be2741d 100644 --- a/README.md +++ b/README.md @@ -7,8 +7,9 @@ * [安装依赖](#2安装依赖) * [设置cookie](#3设置cookie) * [设置user_id](#4设置user_id) - * [运行脚本](#5运行脚本) - * [按需求修改脚本](#6按需求修改脚本可选) + * [设置数据库(可选)](#5设置数据库可选) + * [运行脚本](#6运行脚本) + * [按需求修改脚本(可选)](#7按需求修改脚本可选) * [如何获取cookie](#如何获取cookie) * [如何获取user_id](#如何获取user_id) * [注意事项](#注意事项) @@ -20,7 +21,7 @@ - 写入**MySQL数据库**(可选) - 写入**MongoDB数据库**(可选) - 下载用户微博中的原始**图片**(默认) -- 下载用户微博中的**视频**(默认) +- 下载用户微博中的**视频**(默认)
本程序需要设置用户cookie,以获取微博访问权限,后面会讲解如何获取cookie。如需免cookie版,大家可以访问<https://github.com/dataabc/weibo-crawler>,二者功能类似,免cookie版获取的信息更多,用法更简单,而且不需要cookie。
## 输出 **用户信息** @@ -45,7 +46,7 @@
## 实例 -以爬取迪丽热巴的微博为例,她的微博昵称为"Dear-迪丽热巴",id为1669879400(用户id获取方法见[如何获取user_id](#如何获取user_id))。我们选择爬取她的全部原创微博。具体方法为将**weibospider.py**文件的main函数主要部分修改为如下代码: +以爬取迪丽热巴的微博为例。首先,我们需要为程序设置cookie值,cookie获取及设置方法见[设置cookie](#3设置cookie)。迪丽热巴的微博昵称为"Dear-迪丽热巴",id为1669879400,用户id获取方法见[如何获取user_id](#如何获取user_id)。我们选择爬取她的全部原创微博。具体方法是将**weibospider.py**文件的main函数主要部分修改为如下代码: ```python # 以下是程序配置信息,可以根据自己需求修改 filter = 1 # 值为0表示爬取全部微博(原创微博+转发微博),值为1表示只爬取原创微博 @@ -65,12 +66,11 @@ wb.start(user_id_list) # 爬取微博信息 ``` -具体代码含义注释里都有,因为我本地没有安装MySQL数据库和MongoDB数据库,所以暂时设置成不写入数据库,如果你想要将爬取结果写入数据库,只要先安装数据库,然后将mongodb_write或mysql_write值设置为1即可。写入MySQL需要用户名、密码等配置,这些配置如何设置见数据库部分。 -设置完成后运行程序: +代码具体含义注释里都有,不在赘述。设置完成后运行程序: ```bash $ python weibospider.py ``` -程序会自动生成一个weibo文件夹,我们以后爬取的所有微博都被存储在这里。然后程序在该文件夹下生成一个名为"Dear-迪丽热巴"的文件夹,迪丽热巴的所有微博爬取结果都在这里。"Dear-迪丽热巴"文件夹里包含一个csv文件、一个txt文件、一个img文件夹和一个video文件夹,img文件夹用来存储下载到的图片,video文件夹用来存储下载到的视频。如果你设置了保存数据库功能,这些信息也会保存在数据库里,数据库设置见数据库部分。
+程序会自动生成一个weibo文件夹,我们以后爬取的所有微博都被存储在这里。然后程序在该文件夹下生成一个名为"Dear-迪丽热巴"的文件夹,迪丽热巴的所有微博爬取结果都在这里。"Dear-迪丽热巴"文件夹里包含一个csv文件、一个txt文件、一个img文件夹和一个video文件夹,img文件夹用来存储下载到的图片,video文件夹用来存储下载到的视频。如果你设置了保存数据库功能,这些信息也会保存在数据库里,数据库设置见[设置数据库](#5设置数据库可选)部分。

csv文件结果如下所示: ![](https://picture.cognize.me/cognize/github/weibospider/weibo_csv.png)*1669879400.csv*
@@ -84,8 +84,8 @@ txt文件结果如下所示:
下载的视频如下所示: ![](https://picture.cognize.me/cognize/github/weibospider/video.png)*video文件夹*
-本次下载了70个视频,是她原创微博中的视频,视频名为yyyymmdd+微博id的形式。其中有一个视频因为网络原因下载失败,程序将它的微博id和视频url以“weibo_id:video_url”的形式写到了同文件夹下的not_downloaded.txt里。 - +本次下载了70个视频,是她原创微博中的视频,视频名为yyyymmdd+微博id的形式。其中有一个视频因为网络原因下载失败,程序将它的微博id和视频url以“weibo_id:video_url”的形式写到了同文件夹下的not_downloaded.txt里。
+因为我本地没有安装MySQL数据库和MongoDB数据库,所以暂时设置成不写入数据库。如果你想要将爬取结果写入数据库,只需要先安装数据库(MySQL或MongoDB),再安装对应包(pymysql或pymongo),然后将mysql_write或mongodb_write值设置为1即可。写入MySQL需要用户名、密码等配置信息,这些配置如何设置见[设置数据库](#5设置数据库可选)部分。 ## 运行环境 - 开发语言:python2/python3 - 系统: Windows/Linux/macOS @@ -98,36 +98,89 @@ $ git clone https://github.com/dataabc/weibospider.git 运行上述命令,将本项目下载到当前目录,如果下载成功当前目录会出现一个名为"weibospider"的文件夹; ### 2.安装依赖 ```bash -pip install -r requirements.txt +$ pip install -r requirements.txt ``` ### 3.设置cookie -打开weibospider文件夹下的"**weibospider.py**"文件,将"**your cookie**"替换成爬虫微博的cookie,cookie获取方法见[如何获取cookie](#如何获取cookie); +打开weibospider文件夹下的**weibospider.py**文件,将"**your cookie**"替换成爬虫微博的cookie,具体替换位置大约在**weibospider.py**文件的22行左右。cookie获取方法见[如何获取cookie](#如何获取cookie); ### 4.设置user_id -打开weibospider文件夹下的"**weibospider.py**"文件,将我们想要爬取的**一个**或**多个**微博的user_id赋值给user_id_list,user_id获取方法见[如何获取user_id](#如何获取user_id); +打开weibospider文件夹下的**weibospider.py**文件,将我们想要爬取的**一个**或**多个**微博的user_id赋值给user_id_list,user_id获取方法见[如何获取user_id](#如何获取user_id); user_id设置代码位于**weibospider.py**的main函数里,具体代码如下: ```python # 爬单个微博用户 user_id_list = ['1669879400'] ``` +或者 ```python # 爬多个微博用户 user_id_list = ['1669879400', '1729370543'] ``` +或者 ```python -"""也可以在文件中读取user_id_list,文件中可以包含很多user_id, +"""可以在文件中读取user_id_list,文件中可以包含很多user_id, 每个user_id占一行,文件名任意,类型为txt,位置位于本程序的同目录下, 比如文件可以叫user_id_list.txt""" user_id_list = wb.get_user_list('user_id_list.txt') ``` -### 5.运行脚本 +### 5.设置数据库(可选) +本部分是可选部分,如果不需要将爬取信息写入数据库,可跳过这一步。本程序目前支持MySQL数据库和MongoDB数据库,如果你需要写入其它数据库,可以参考这两个数据库的写法自己编写。
+**MySQL数据库写入**
+要想将爬取信息写入MySQL,请将main函数中的mysql_write变量值改为1。再根据自己的系统环境安装MySQL,然后命令行执行: +```bash +$ pip install pymysql +``` +MySQL写入需要主机、端口号、用户名、密码等配置,本程序默认的配置如下: +```python + mysql_config = { + 'host': 'localhost', + 'port': 3306, + 'user': 'root', + 'password': '123456', + 'charset': 'utf8mb4' + } +``` +如果你的配置和上面不同,需要修改main函数,将本程序的配置改成自己的配置,具体代码如下: +```python + mysql_config = { + 'host': 'xxx', + 'port': xxx, + 'user': 'xxx', + 'password': 'xxx', + 'charset': 'utf8mb4' + } + wb.change_mysql_config(mysql_config) +``` +**MongoDB数据库写入**
+要想将爬取信息写入MongoDB,请将main函数中的mongodb_write变量值改为1。再根据自己的系统环境安装MongoDB,然后命令行执行: +``` +$ pip install pymongo +``` +MySQL和MongDB数据库的写入内容一样。程序会创建一个名为"weibo"的数据库,再创建一个"weibo"表,包含爬取的所有内容,用户爬取的微博信息或插入或更新,都会存储在"weibo"表里,想了解数据库的具体字段,请点击"详情"。 +
+详情 + +**id**:存储微博id;
+**user_id**: 存储微博发布者的用户id;
+**content**:存储微博正文;
+**original_pictures**:存储原创微博的原始图片url和转发微博转发理由中的图片url。若某条微博有多张图片,则存储多个url,以英文逗号分割;若某微博没有图片,则值为"无";
+**retweet_pictures**:存储被转发微博中的原始图片url。当最新微博为原创微博或者为没有图片的转发微博时,则值为"无",否则为被转发微博的图片url。若有多张图片,则存储多个url,以英文逗号分割;
+**publish_place**:存储微博的发布位置。如果某条微博没有位置信息,则值为"无";
+**publish_time**:存储微博的发布时间;
+**up_num**:存储微博获得的点赞数;
+**retweet_num**:存储微博获得的转发数;
+**comment_num**:存储微博获得的评论数;
+**publish_tool**:存储微博的发布工具。 + +
+ +### 6.运行脚本 大家可以根据自己的运行环境选择运行方式,Linux可以通过 ```bash $ python weibospider.py ``` 运行; -### 6.按需求修改脚本(可选) -本脚本是一个Weibo类,用户可以按照自己的需求调用Weibo类。 -例如用户可以直接在"weibospider.py"文件中调用Weibo类,具体调用代码示例如下: +### 7.按需求修改脚本(可选) +本程序是一个Weibo类,用户可以按照自己的需求调用Weibo类。 +用户可以直接在**weibospider.py**文件中调用Weibo类,具体调用代码示例如下: ```python # 以下是程序配置信息,可以根据自己需求修改 filter = 1 # 值为0表示爬取全部微博(原创微博+转发微博),值为1表示只爬取原创微博 From c6359f0456d02ad270088f15fcc2cff8e9c4ab27 Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Wed, 11 Sep 2019 21:30:33 +0800 Subject: [PATCH 071/363] Update README.md --- README.md | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index 7be2741d..faaab378 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,7 @@ * [注意事项](#注意事项) ## 功能 -爬取**一个**或**多个**新浪微博用户的数据,并将结果信息写入文件。写入信息几乎包括了用户微博的所有数据,主要有用户信息和微博信息两大类,前者包含用户昵称、关注数、粉丝数、微博数等等;后者包含微博正文、发布时间、发布工具、评论数等等,因为内容太多,这里不再赘述,详细内容见[输出](#输出)部分。具体的写入文件类型如下: +爬取**一个**或**多个**新浪微博用户(如[胡歌](https://weibo.cn/u/1223178222)、[迪丽热巴](https://weibo.cn/u/1669879400)、[郭碧婷](https://weibo.cn/u/1729370543))的数据,并将结果信息写入文件。写入信息几乎包括了用户微博的所有数据,主要有用户信息和微博信息两大类,前者包含用户昵称、关注数、粉丝数、微博数等等;后者包含微博正文、发布时间、发布工具、评论数等等,因为内容太多,这里不再赘述,详细内容见[输出](#输出)部分。具体的写入文件类型如下: - 写入**txt文件**(默认) - 写入**csv文件**(默认) - 写入**MySQL数据库**(可选) @@ -57,8 +57,8 @@ """值为0代表不将结果写入MySQL数据库,1代表写入;若要写入MySQL数据库, 请先安装MySQL数据库和pymysql,pymysql安装方法为命令行运行:pip install pymysql""" mysql_write = 0 - pic_download = 0 # 值为0代表不下载微博原始图片,1代表下载微博原始图片 - video_download = 0 # 值为0代表不下载微博视频,1代表下载微博视频 + pic_download = 1 # 值为0代表不下载微博原始图片,1代表下载微博原始图片 + video_download = 1 # 值为0代表不下载微博视频,1代表下载微博视频 wb = Weibo(filter, since_date, mongodb_write, mysql_write, pic_download, video_download) @@ -103,16 +103,15 @@ $ pip install -r requirements.txt ### 3.设置cookie 打开weibospider文件夹下的**weibospider.py**文件,将"**your cookie**"替换成爬虫微博的cookie,具体替换位置大约在**weibospider.py**文件的22行左右。cookie获取方法见[如何获取cookie](#如何获取cookie); ### 4.设置user_id 
-打开weibospider文件夹下的**weibospider.py**文件,将我们想要爬取的**一个**或**多个**微博的user_id赋值给user_id_list,user_id获取方法见[如何获取user_id](#如何获取user_id); -user_id设置代码位于**weibospider.py**的main函数里,具体代码如下: +打开weibospider文件夹下的**weibospider.py**文件,将我们想要爬取的**一个**或**多个**微博的user_id赋值给user_id_list,user_id获取方法见[如何获取user_id](#如何获取user_id)。user_id设置代码位于**weibospider.py**的main函数里,具体代码如下: ```python -# 爬单个微博用户 +# 爬单个微博用户,可以改成任意合法的用户id user_id_list = ['1669879400'] ``` 或者 ```python -# 爬多个微博用户 -user_id_list = ['1669879400', '1729370543'] +# 爬多个微博用户,可以改成任意合法的用户id +user_id_list = ['1223178222', '1669879400', '1729370543'] ``` 或者 ```python @@ -151,7 +150,7 @@ MySQL写入需要主机、端口号、用户名、密码等配置,本程序默 ``` **MongoDB数据库写入**
要想将爬取信息写入MongoDB,请将main函数中的mongodb_write变量值改为1。再根据自己的系统环境安装MongoDB,然后命令行执行: -``` +```bash $ pip install pymongo ``` MySQL和MongDB数据库的写入内容一样。程序会创建一个名为"weibo"的数据库,再创建一个"weibo"表,包含爬取的所有内容,用户爬取的微博信息或插入或更新,都会存储在"weibo"表里,想了解数据库的具体字段,请点击"详情"。 @@ -215,12 +214,12 @@ $ python weibospider.py 爬单个微博,user_id_list如下所示,可以改成任意合法的用户id user_id_list = ['1669879400'] 爬多个微博,user_id_list如下所示,可以改成任意合法的用户id - user_id_list = ['1669879400', '1729370543'] + user_id_list = ['1223178222', '1669879400', '1729370543'] 也可以在文件中读取user_id_list,文件中可以包含很多user_id, 每个user_id占一行,文件名任意,类型为txt,位置位于本程序的同目录下, 比如文件可以叫user_id_list.txt,读取文件中的user_id_list如下所示: user_id_list = wb.get_user_list('user_id_list.txt')""" - user_id_list = ['1669879400', '1729370543'] + user_id_list = ['1223178222', '1669879400', '1729370543'] wb.start(user_id_list) # 爬取微博信息 ``` From 098ddbb13a73ad08b16026ff37671d38053b7f2e Mon Sep 17 00:00:00 2001 From: chenlei Date: Thu, 12 Sep 2019 23:16:03 +0800 Subject: [PATCH 072/363] =?UTF-8?q?feat:=20=E6=B7=BB=E5=8A=A0=E5=B0=86?= =?UTF-8?q?=E7=94=A8=E6=88=B7=E4=BF=A1=E6=81=AF=E5=86=99=E5=85=A5MongoDB?= =?UTF-8?q?=E6=95=B0=E6=8D=AE=E5=BA=93user=E8=A1=A8=E7=9A=84=E5=8A=9F?= =?UTF-8?q?=E8=83=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- weiboSpider.py | 81 +++++++++++++++++++++++++++++++------------------- 1 file changed, 50 insertions(+), 31 deletions(-) diff --git a/weiboSpider.py b/weiboSpider.py index 5ceb1635..9137d93a 100644 --- a/weiboSpider.py +++ b/weiboSpider.py @@ -48,12 +48,9 @@ def __init__(self, self.mysql_write = mysql_write # 值为0代表不将结果写入MySQL数据库,1代表写入 self.pic_download = pic_download # 取值范围为0、1,程序默认值为0,代表不下载微博原始图片,1代表下载 self.video_download = video_download # 取值范围为0、1,程序默认为0,代表不下载微博视频,1代表下载 - self.nickname = '' # 用户昵称,如“Dear-迪丽热巴” - self.weibo_num = 0 # 用户全部微博数 self.got_num = 0 # 爬取到的微博数 - self.following = 0 # 用户关注数 - self.followers = 0 # 用户粉丝数 self.weibo = [] # 存储爬取到的所有微博信息 + self.user = {} # # 存储爬取到的用户信息 self.mysql_config = { } # 
MySQL数据库连接配置,可以不填,当使用者的mysql用户名、密码等与本程序默认值不同时,需要通过mysql_config来自定义 @@ -91,28 +88,45 @@ def get_nickname(self): url = 'https://weibo.cn/%s/info' % (self.user_id) selector = self.deal_html(url) nickname = selector.xpath('//title/text()')[0] - self.nickname = nickname[:-3] - if self.nickname == u'登录 - 新' or self.nickname == u'新浪': + nickname = nickname[:-3] + if nickname == u'登录 - 新' or nickname == u'新浪': sys.exit(u'cookie错误或已过期,请按照README中方法重新获取') - print(u'用户昵称: ' + self.nickname) + self.user['nickname'] = nickname + print(u'用户昵称: ' + nickname) except Exception as e: print('Error: ', e) traceback.print_exc() + def user_to_mongodb(self): + """将爬取的用户信息写入MongoDB数据库""" + user_list = [self.user] + self.info_to_mongodb('user', user_list) + print(u'%s信息写入MongoDB数据库完毕' % self.user['nickname']) + + def user_to_database(self): + """将用户信息写入数据库""" + if self.mongodb_write: + self.user_to_mongodb() + def get_user_info(self, selector): """获取用户昵称、微博数、关注数、粉丝数""" try: self.get_nickname() # 获取用户昵称 user_info = selector.xpath("//div[@class='tip2']/*/text()") - self.weibo_num = int(user_info[0][3:-1]) - print(u'微博数: ' + str(self.weibo_num)) + weibo_num = int(user_info[0][3:-1]) + print(u'微博数: ' + str(weibo_num)) - self.following = int(user_info[1][3:-1]) - print(u'关注数: ' + str(self.following)) + following = int(user_info[1][3:-1]) + print(u'关注数: ' + str(following)) - self.followers = int(user_info[2][3:-1]) - print(u'粉丝数: ' + str(self.followers)) + followers = int(user_info[2][3:-1]) + print(u'粉丝数: ' + str(followers)) + self.user['weibo_num'] = weibo_num + self.user['following'] = following + self.user['followers'] = followers + self.user['id'] = self.user_id + self.user_to_database() print('*' * 100) except Exception as e: print('Error: ', e) @@ -542,8 +556,9 @@ def get_one_page(self, page): def get_filepath(self, type): """获取结果文件路径""" try: - file_dir = os.path.split(os.path.realpath( - __file__))[0] + os.sep + 'weibo' + os.sep + self.nickname + file_dir = os.path.split( + 
os.path.realpath(__file__) + )[0] + os.sep + 'weibo' + os.sep + self.user['nickname'] if type == 'img' or type == 'video': file_dir = file_dir + os.sep + type if not os.path.isdir(file_dir): @@ -608,11 +623,11 @@ def write_txt(self, wrote_num): result_header = u'\n\n原创微博内容: \n' else: result_header = u'\n\n微博内容: \n' - result_header = (u'用户信息\n用户昵称:' + self.nickname + u'\n用户id: ' + - str(self.user_id) + u'\n微博数: ' + - str(self.weibo_num) + u'\n关注数: ' + - str(self.following) + u'\n粉丝数: ' + - str(self.followers) + result_header) + result_header = (u'用户信息\n用户昵称:' + self.user['nickname'] + + u'\n用户id: ' + str(self.user_id) + u'\n微博数: ' + + str(self.user['weibo_num']) + u'\n关注数: ' + + str(self.user['following']) + u'\n粉丝数: ' + + str(self.user['followers']) + result_header) temp_result.append(result_header) for i, w in enumerate(self.weibo[wrote_num:]): temp_result.append( @@ -631,19 +646,26 @@ def write_txt(self, wrote_num): print('Error: ', e) traceback.print_exc() - def write_mongodb(self, wrote_num): + def info_to_mongodb(self, collection, info_list): """将爬取的信息写入MongoDB数据库""" from pymongo import MongoClient client = MongoClient() db = client['weibo'] - collection = db['weibo'] + collection = db[collection] + for info in info_list: + if not collection.find_one({'id': info['id']}): + collection.insert_one(info) + else: + collection.update_one({'id': info['id']}, {'$set': info}) + + def weibo_to_mongodb(self, wrote_num): + """将爬取的微博信息写入MongoDB数据库""" + weibo_list = [] for w in self.weibo[wrote_num:]: w['user_id'] = self.user_id - if not collection.find_one({'id': w['id']}): - collection.insert_one(w) - else: - collection.update_one({'id': w['id']}, {'$set': w}) + weibo_list.append(w) + self.info_to_mongodb('weibo', weibo_list) print(u'%d条微博写入MongoDB数据库完毕' % self.got_num) def change_mysql_config(self, mysql_config): @@ -757,7 +779,7 @@ def write_data(self, wrote_num): if self.mysql_write: self.write_mysql(wrote_num) if self.mongodb_write: - self.write_mongodb(wrote_num) + 
self.weibo_to_mongodb(wrote_num) def get_weibo_info(self): """获取微博信息""" @@ -803,12 +825,9 @@ def get_user_list(self, file_name): def initialize_info(self, user_id): """初始化爬虫信息""" - self.nickname = '' - self.weibo_num = 0 self.got_num = 0 - self.following = 0 - self.followers = 0 self.weibo = [] + self.user = {} self.user_id = user_id def start(self, user_id_list): From 27c3b950557ca91aee01acf679b8a93288ccf607 Mon Sep 17 00:00:00 2001 From: chenlei Date: Thu, 12 Sep 2019 23:22:13 +0800 Subject: [PATCH 073/363] =?UTF-8?q?fix:=20=E4=BF=AE=E5=A4=8D=E5=9B=A0?= =?UTF-8?q?=E6=97=B6=E9=97=B4=E4=B8=AD=E6=9C=89=E7=89=B9=E6=AE=8A=E5=AD=97?= =?UTF-8?q?=E7=AC=A6=E5=AF=BC=E8=87=B4=E6=97=A0=E6=B3=95=E5=86=99=E5=85=A5?= =?UTF-8?q?MySQL=E7=9A=84=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- weiboSpider.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/weiboSpider.py b/weiboSpider.py index 9137d93a..c4a21a5a 100644 --- a/weiboSpider.py +++ b/weiboSpider.py @@ -277,6 +277,8 @@ def get_publish_time(self, info): today = datetime.now().strftime('%Y-%m-%d') time = publish_time[3:] publish_time = today + ' ' + time + if len(publish_time) > 16: + publish_time = publish_time[:16] elif u'月' in publish_time: year = datetime.now().strftime('%Y') month = publish_time[0:2] From 853ed0e84027c5d5e329c0957cf702343becd5b1 Mon Sep 17 00:00:00 2001 From: chenlei Date: Fri, 13 Sep 2019 02:29:22 +0800 Subject: [PATCH 074/363] =?UTF-8?q?feat:=20=E6=B7=BB=E5=8A=A0=E5=B0=86?= =?UTF-8?q?=E7=94=A8=E6=88=B7=E4=BF=A1=E6=81=AF=E5=86=99=E5=85=A5MySQL?= =?UTF-8?q?=E6=95=B0=E6=8D=AE=E5=BA=93user=E8=A1=A8=E7=9A=84=E5=8A=9F?= =?UTF-8?q?=E8=83=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- weiboSpider.py | 37 +++++++++++++++++++++++++++++++------ 1 file changed, 31 insertions(+), 6 deletions(-) diff --git a/weiboSpider.py b/weiboSpider.py index c4a21a5a..1552bf2c 100644 --- a/weiboSpider.py +++ 
b/weiboSpider.py @@ -103,8 +103,37 @@ def user_to_mongodb(self): self.info_to_mongodb('user', user_list) print(u'%s信息写入MongoDB数据库完毕' % self.user['nickname']) + def user_to_mysql(self): + """将爬取的用户信息写入MySQL数据库""" + mysql_config = { + 'host': 'localhost', + 'port': 3306, + 'user': 'root', + 'password': '123456', + 'charset': 'utf8mb4' + } + # 创建'weibo'数据库 + create_database = """CREATE DATABASE IF NOT EXISTS weibo DEFAULT + CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci""" + self.mysql_create_database(mysql_config, create_database) + # 创建'user'表 + create_table = """ + CREATE TABLE IF NOT EXISTS user ( + id varchar(12) NOT NULL, + nickname varchar(30), + weibo_num INT, + following INT, + followers INT, + PRIMARY KEY (id) + ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4""" + self.mysql_create_table(mysql_config, create_table) + self.mysql_insert(mysql_config, 'user', [self.user]) + print(u'%s信息写入MySQL数据库完毕' % self.user['nickname']) + def user_to_database(self): """将用户信息写入数据库""" + if self.mysql_write: + self.user_to_mysql() if self.mongodb_write: self.user_to_mongodb() @@ -733,7 +762,7 @@ def mysql_insert(self, mysql_config, table, data_list): finally: connection.close() - def write_mysql(self, wrote_num): + def weibo_to_mysql(self, wrote_num): """将爬取的信息写入MySQL数据库""" mysql_config = { 'host': 'localhost', @@ -742,10 +771,6 @@ def write_mysql(self, wrote_num): 'password': '123456', 'charset': 'utf8mb4' } - # 创建'weibo'数据库 - create_database = """CREATE DATABASE IF NOT EXISTS weibo DEFAULT - CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci""" - self.mysql_create_database(mysql_config, create_database) # 创建'weibo'表 create_table = """ CREATE TABLE IF NOT EXISTS weibo ( @@ -779,7 +804,7 @@ def write_data(self, wrote_num): self.write_csv(wrote_num) self.write_txt(wrote_num) if self.mysql_write: - self.write_mysql(wrote_num) + self.weibo_to_mysql(wrote_num) if self.mongodb_write: self.weibo_to_mongodb(wrote_num) From 807dfa96e05d3680eecb8d6ac709ff00c05f8a24 Mon Sep 17 00:00:00 2001 
From: Chen Lei Date: Fri, 13 Sep 2019 21:13:52 +0800 Subject: [PATCH 075/363] Update README.md --- README.md | 37 +++++++++++++++++++++++++++++++------ 1 file changed, 31 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index faaab378..601b3f83 100644 --- a/README.md +++ b/README.md @@ -20,20 +20,34 @@ - 写入**csv文件**(默认) - 写入**MySQL数据库**(可选) - 写入**MongoDB数据库**(可选) -- 下载用户微博中的原始**图片**(默认) -- 下载用户微博中的**视频**(默认)
+- 下载用户微博中的原始**图片**(可选) +- 下载用户微博中的**视频**(可选)
+ 本程序需要设置用户cookie,以获取微博访问权限,后面会讲解如何获取cookie。如需免cookie版,大家可以访问<https://github.com/dataabc/weibo-crawler>,二者功能类似,免cookie版获取的信息更多,用法更简单,而且不需要cookie。
## 输出 +本部分为爬取到的字段信息说明,为了与[免cookie版](https://github.com/dataabc/weibo-crawler)区分,下面将两者爬取到的信息都列出来。如果是免cookie版所特有的信息,会有免cookie标注,没有标注的为二者共有的信息。
**用户信息** +- 用户id:微博用户id,如"1669879400",其实这个字段本来就是已知字段 - 昵称:用户昵称,如"Dear-迪丽热巴" - 微博数:用户的全部微博数(转发微博+原创微博) - 关注数:用户关注的微博数量 - 粉丝数:用户的粉丝数 +- 性别(免cookie版):微博用户性别 +- 简介(免cookie版):用户简介 +- 主页地址(免cookie版):微博移动版主页url +- 头像url(免cookie版):用户头像url +- 高清头像url(免cookie版):用户高清头像url +- 微博等级(免cookie版):用户微博等级 +- 会员等级(免cookie版):微博会员用户等级,普通用户该等级为0 +- 是否认证(免cookie版):用户是否认证,为布尔类型 +- 认证类型(免cookie版):用户认证类型,如个人认证、企业认证、政府认证等 +- 认证信息(免cookie版):为认证用户特有,用户信息栏显示的认证信息 *** **微博信息** - 微博id:微博唯一标志 - 微博内容:微博正文 -- 原始图片url:原创微博图片和转发微博转发理由中图片的url,若某条微博存在多张图片,每个url以英文逗号分隔,若没有图片则值为无 +- 原始图片url:原创微博图片和转发微博转发理由中图片的url,若某条微博存在多张图片,每个url以英文逗号分隔,若没有图片则值为"无" +- 视频url: 微博中的视频url,若微博中没有视频,则值为"无" - 微博发布位置:位置微博中的发布位置 - 微博发布时间:微博发布时的时间,精确到分 - 点赞数:微博被赞的数量 @@ -43,6 +57,9 @@ - 结果文件:保存在当前目录weibo文件夹下以用户昵称为名的文件夹里,名字为"user_id.csv"和"user_id.txt"的形式 - 微博图片:原创微博中的图片和转发微博转发理由中的图片,保存在以用户昵称为名的文件夹下的img文件夹里 - 微博视频:原创微博中的视频,保存在以用户昵称为名的文件夹下的video文件夹里 +- 话题(免cookie版):微博话题,即两个#中的内容,若存在多个话题,每个url以英文逗号分隔,若没有则值为'' +- @用户(免cookie版):微博@的用户,若存在多个@用户,每个url以英文逗号分隔,若没有则值为'' +- 原始微博(免cookie版):为转发微博所特有,是转发微博中那条被转发的微博,存储为字典形式,包含了上述微博信息中的所有内容,如微博id、微博内容等等
## 实例 @@ -153,12 +170,20 @@ MySQL写入需要主机、端口号、用户名、密码等配置,本程序默 ```bash $ pip install pymongo ``` -MySQL和MongDB数据库的写入内容一样。程序会创建一个名为"weibo"的数据库,再创建一个"weibo"表,包含爬取的所有内容,用户爬取的微博信息或插入或更新,都会存储在"weibo"表里,想了解数据库的具体字段,请点击"详情"。 +MySQL和MongDB数据库的写入内容一样。程序会创建一个名为"weibo"的数据库,再创建一个"user"表和"weibo"表,包含爬取的所有内容。爬取到的微博用户信息或插入或更新,都会存储到user表里;爬取到的微博信息或插入或更新,都会存储到weibo表里,两个表通过user_id关联。想了解数据库的具体字段,请点击"详情"。
详情 - + +**user表**
+**id**:存储用户id,如"1669879400";
+**nickname**:存储用户昵称,如"Dear-迪丽热巴";
+**weibo_num**:存储微博数;
+**following**:存储关注数;
+**followers**:存储粉丝数。
+*** +**weibo表**
**id**:存储微博id;
-**user_id**: 存储微博发布者的用户id;
+**user_id**:存储微博发布者的用户id,如"1669879400";
**content**:存储微博正文;
**original_pictures**:存储原创微博的原始图片url和转发微博转发理由中的图片url。若某条微博有多张图片,则存储多个url,以英文逗号分割;若某微博没有图片,则值为"无";
**retweet_pictures**:存储被转发微博中的原始图片url。当最新微博为原创微博或者为没有图片的转发微博时,则值为"无",否则为被转发微博的图片url。若有多张图片,则存储多个url,以英文逗号分割;
From 9e321cf39c7aabd48e7b8ab1e3cae17df6058fe8 Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Fri, 13 Sep 2019 21:54:16 +0800 Subject: [PATCH 076/363] Update README.md --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 601b3f83..64749df3 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,7 @@ * [注意事项](#注意事项) ## 功能 -爬取**一个**或**多个**新浪微博用户(如[胡歌](https://weibo.cn/u/1223178222)、[迪丽热巴](https://weibo.cn/u/1669879400)、[郭碧婷](https://weibo.cn/u/1729370543))的数据,并将结果信息写入文件。写入信息几乎包括了用户微博的所有数据,主要有用户信息和微博信息两大类,前者包含用户昵称、关注数、粉丝数、微博数等等;后者包含微博正文、发布时间、发布工具、评论数等等,因为内容太多,这里不再赘述,详细内容见[输出](#输出)部分。具体的写入文件类型如下: +连续爬取**一个**或**多个**新浪微博用户(如[胡歌](https://weibo.cn/u/1223178222)、[迪丽热巴](https://weibo.cn/u/1669879400)、[郭碧婷](https://weibo.cn/u/1729370543))的数据,并将结果信息写入文件。写入信息几乎包括了用户微博的所有数据,主要有**用户信息**和**微博信息**两大类,前者包含用户昵称、关注数、粉丝数、微博数等等;后者包含微博正文、发布时间、发布工具、评论数等等,因为内容太多,这里不再赘述,详细内容见[输出](#输出)部分。具体的写入文件类型如下: - 写入**txt文件**(默认) - 写入**csv文件**(默认) - 写入**MySQL数据库**(可选) @@ -34,7 +34,7 @@ - 粉丝数:用户的粉丝数 - 性别(免cookie版):微博用户性别 - 简介(免cookie版):用户简介 -- 主页地址(免cookie版):微博移动版主页url +- 主页地址(免cookie版):微博移动版主页url,如 - 头像url(免cookie版):用户头像url - 高清头像url(免cookie版):用户高清头像url - 微博等级(免cookie版):用户微博等级 @@ -170,7 +170,7 @@ MySQL写入需要主机、端口号、用户名、密码等配置,本程序默 ```bash $ pip install pymongo ``` -MySQL和MongDB数据库的写入内容一样。程序会创建一个名为"weibo"的数据库,再创建一个"user"表和"weibo"表,包含爬取的所有内容。爬取到的微博用户信息或插入或更新,都会存储到user表里;爬取到的微博信息或插入或更新,都会存储到weibo表里,两个表通过user_id关联。想了解数据库的具体字段,请点击"详情"。 +MySQL和MongDB数据库的写入内容一样。程序首先会创建一个名为"weibo"的数据库,然后再创建"user"表和"weibo"表,包含爬取的所有内容。爬取到的微博**用户信息**或插入或更新,都会存储到user表里;爬取到的**微博信息**或插入或更新,都会存储到weibo表里,两个表通过user_id关联。如果想了解两个表的具体字段,请点击"详情"。
详情 From 599c7de4abe780b0fb38595655922f553d6a7d72 Mon Sep 17 00:00:00 2001 From: chenlei Date: Sun, 15 Sep 2019 18:11:27 +0800 Subject: [PATCH 077/363] =?UTF-8?q?feat:=20=E6=B7=BB=E5=8A=A0=E6=89=93?= =?UTF-8?q?=E5=8D=B0=E5=BE=AE=E5=8D=9A=E7=94=A8=E6=88=B7=E4=BF=A1=E6=81=AF?= =?UTF-8?q?=E5=8A=9F=E8=83=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- weiboSpider.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/weiboSpider.py b/weiboSpider.py index 1552bf2c..dbad9068 100644 --- a/weiboSpider.py +++ b/weiboSpider.py @@ -92,7 +92,6 @@ def get_nickname(self): if nickname == u'登录 - 新' or nickname == u'新浪': sys.exit(u'cookie错误或已过期,请按照README中方法重新获取') self.user['nickname'] = nickname - print(u'用户昵称: ' + nickname) except Exception as e: print('Error: ', e) traceback.print_exc() @@ -137,20 +136,22 @@ def user_to_database(self): if self.mongodb_write: self.user_to_mongodb() + def print_user(self): + """打印微博用户信息""" + print(u'用户昵称: ' + self.user['nickname']) + print(u'用户id: ' + self.user['id']) + print(u'微博数: ' + self.user['weibo_num']) + print(u'关注数: ' + self.user['following']) + print(u'粉丝数: ' + self.user['followers']) + def get_user_info(self, selector): """获取用户昵称、微博数、关注数、粉丝数""" try: self.get_nickname() # 获取用户昵称 user_info = selector.xpath("//div[@class='tip2']/*/text()") - weibo_num = int(user_info[0][3:-1]) - print(u'微博数: ' + str(weibo_num)) - following = int(user_info[1][3:-1]) - print(u'关注数: ' + str(following)) - followers = int(user_info[2][3:-1]) - print(u'粉丝数: ' + str(followers)) self.user['weibo_num'] = weibo_num self.user['following'] = following self.user['followers'] = followers @@ -763,7 +764,7 @@ def mysql_insert(self, mysql_config, table, data_list): connection.close() def weibo_to_mysql(self, wrote_num): - """将爬取的信息写入MySQL数据库""" + """将爬取的微博信息写入MySQL数据库""" mysql_config = { 'host': 'localhost', 'port': 3306, From a9fbb8666a7261fc83b4346e3639b2d2a7905a98 Mon Sep 17 00:00:00 2001 
From: chenlei Date: Sun, 15 Sep 2019 19:00:23 +0800 Subject: [PATCH 078/363] =?UTF-8?q?fix:=20=E4=BF=AE=E5=A4=8D=E5=BE=AE?= =?UTF-8?q?=E5=8D=9A=E7=88=AC=E5=8F=96=E9=87=8D=E5=A4=8D=E7=9A=84=E9=97=AE?= =?UTF-8?q?=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- weiboSpider.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/weiboSpider.py b/weiboSpider.py index dbad9068..6747bd80 100644 --- a/weiboSpider.py +++ b/weiboSpider.py @@ -50,7 +50,8 @@ def __init__(self, self.video_download = video_download # 取值范围为0、1,程序默认为0,代表不下载微博视频,1代表下载 self.got_num = 0 # 爬取到的微博数 self.weibo = [] # 存储爬取到的所有微博信息 - self.user = {} # # 存储爬取到的用户信息 + self.user = {} # 存储爬取到的用户信息 + self.weibo_id_list = [] # 存储爬取到的所有微博id self.mysql_config = { } # MySQL数据库连接配置,可以不填,当使用者的mysql用户名、密码等与本程序默认值不同时,需要通过mysql_config来自定义 @@ -136,7 +137,7 @@ def user_to_database(self): if self.mongodb_write: self.user_to_mongodb() - def print_user(self): + def print_user_info(self): """打印微博用户信息""" print(u'用户昵称: ' + self.user['nickname']) print(u'用户id: ' + self.user['id']) @@ -156,6 +157,7 @@ def get_user_info(self, selector): self.user['following'] = following self.user['followers'] = followers self.user['id'] = self.user_id + self.print_user_info() self.user_to_database() print('*' * 100) except Exception as e: @@ -568,6 +570,8 @@ def get_one_page(self, page): for i in range(0, len(info) - 2): weibo = self.get_one_weibo(info[i]) if weibo: + if weibo['id'] in self.weibo_id_list: + continue publish_time = datetime.strptime( weibo['publish_time'][:10], "%Y-%m-%d") since_date = datetime.strptime(self.since_date, @@ -579,6 +583,7 @@ def get_one_page(self, page): return True self.print_one_weibo(weibo) self.weibo.append(weibo) + self.weibo_id_list.append(weibo['id']) self.got_num += 1 print('-' * 100) except Exception as e: @@ -857,6 +862,7 @@ def initialize_info(self, user_id): self.weibo = [] self.user = {} self.user_id = user_id + 
self.weibo_id_list = [] def start(self, user_id_list): """运行爬虫""" From f4f73175fa6da34915892cae9d48cc5b0234c8ec Mon Sep 17 00:00:00 2001 From: chenlei Date: Sun, 15 Sep 2019 19:41:46 +0800 Subject: [PATCH 079/363] =?UTF-8?q?fix:=20=E4=BF=AE=E5=A4=8D=E6=89=93?= =?UTF-8?q?=E5=8D=B0=E5=BE=AE=E5=8D=9A=E7=94=A8=E6=88=B7=E5=87=BA=E9=94=99?= =?UTF-8?q?=E7=9A=84=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- weiboSpider.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/weiboSpider.py b/weiboSpider.py index 6747bd80..760b8592 100644 --- a/weiboSpider.py +++ b/weiboSpider.py @@ -139,11 +139,11 @@ def user_to_database(self): def print_user_info(self): """打印微博用户信息""" - print(u'用户昵称: ' + self.user['nickname']) - print(u'用户id: ' + self.user['id']) - print(u'微博数: ' + self.user['weibo_num']) - print(u'关注数: ' + self.user['following']) - print(u'粉丝数: ' + self.user['followers']) + print(u'用户昵称: %s' % self.user['nickname']) + print(u'用户id: %s' % self.user['id']) + print(u'微博数: %d' % self.user['weibo_num']) + print(u'关注数: %d' % self.user['following']) + print(u'粉丝数: %d' % self.user['followers']) def get_user_info(self, selector): """获取用户昵称、微博数、关注数、粉丝数""" From 4bc16fb6a778999439a295c646042a9c4432622d Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Mon, 30 Sep 2019 18:51:27 +0800 Subject: [PATCH 080/363] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 64749df3..5a0a9fb7 100644 --- a/README.md +++ b/README.md @@ -57,6 +57,7 @@ - 结果文件:保存在当前目录weibo文件夹下以用户昵称为名的文件夹里,名字为"user_id.csv"和"user_id.txt"的形式 - 微博图片:原创微博中的图片和转发微博转发理由中的图片,保存在以用户昵称为名的文件夹下的img文件夹里 - 微博视频:原创微博中的视频,保存在以用户昵称为名的文件夹下的video文件夹里 +- 微博bid(免cookie版):为[免cookie版](https://github.com/dataabc/weibo-crawler)所特有,与本程序中的微博id是同一个值 - 话题(免cookie版):微博话题,即两个#中的内容,若存在多个话题,每个url以英文逗号分隔,若没有则值为'' - @用户(免cookie版):微博@的用户,若存在多个@用户,每个url以英文逗号分隔,若没有则值为'' - 
原始微博(免cookie版):为转发微博所特有,是转发微博中那条被转发的微博,存储为字典形式,包含了上述微博信息中的所有内容,如微博id、微博内容等等 From 435263b828ef16ba8b1d7e2a9a642dbcc5eede19 Mon Sep 17 00:00:00 2001 From: chenlei Date: Tue, 8 Oct 2019 20:02:06 +0800 Subject: [PATCH 081/363] =?UTF-8?q?perf:=20=E4=BC=98=E5=8C=96=E7=94=A8?= =?UTF-8?q?=E6=88=B7id=E6=96=87=E4=BB=B6=E8=AF=BB=E5=8F=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 主要增加了对从文件读取用户id时数据的处理,可以在用户id后面添加注释,如用户昵称,id和注释之间必需要要空格,文件内容示例如下: ``` 1223178222 胡歌 1669879400 迪丽热巴 1729370543 郭碧婷 ``` 也可以另起一行添加注释,如: ``` #胡歌 1223178222 #迪丽热巴 1669879400 #郭碧婷 1729370543 ``` --- weiboSpider.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/weiboSpider.py b/weiboSpider.py index 760b8592..529781ed 100644 --- a/weiboSpider.py +++ b/weiboSpider.py @@ -852,8 +852,13 @@ def get_weibo_info(self): def get_user_list(self, file_name): """获取文件中的微博id信息""" - with open(file_name, 'r') as f: - user_id_list = f.read().splitlines() + with open(file_name, 'rb') as f: + lines = f.read().splitlines() + lines = [line.decode('utf-8') for line in lines] + user_id_list = [ + line.split(' ')[0] for line in lines + if len(line.split(' ')) > 0 and line.split(' ')[0].isdigit() + ] return user_id_list def initialize_info(self, user_id): @@ -869,6 +874,7 @@ def start(self, user_id_list): try: for user_id in user_id_list: self.initialize_info(user_id) + print('*' * 100) self.get_weibo_info() print(u'信息抓取完毕') print('*' * 100) @@ -918,7 +924,11 @@ def main(): 爬多个微博,user_id_list如下所示,可以改成任意合法的用户id user_id_list = ['1669879400', '1729370543'] 也可以在文件中读取user_id_list,文件中可以包含很多user_id, - 每个user_id占一行,文件名任意,类型为txt,位置位于本程序的同目录下, + 每个user_id占一行,也可以在user_id后面加注释,如用户昵称,user_id和注释之间必需要有空格, + 文件名任意,类型为txt,位置位于本程序的同目录下,文件内容可以为如下形式: + 1223178222 胡歌 + 1669879400 迪丽热巴 + 1729370543 郭碧婷 比如文件可以叫user_id_list.txt,读取文件中的user_id_list如下所示: user_id_list = wb.get_user_list('user_id_list.txt')""" user_id_list = ['1669879400'] From 
a5a57d9299daffeb257e3aca34d1ac78ca0210d7 Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Tue, 8 Oct 2019 20:43:13 +0800 Subject: [PATCH 082/363] Update README.md --- README.md | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 5a0a9fb7..a644220c 100644 --- a/README.md +++ b/README.md @@ -131,11 +131,14 @@ user_id_list = ['1669879400'] # 爬多个微博用户,可以改成任意合法的用户id user_id_list = ['1223178222', '1669879400', '1729370543'] ``` -或者 +也可以读取文件中的用户id,每个user_id占一行,也可以在user_id后面加注释(可选),如用户昵称等信息,user_id和注释之间必需要有空格,文件名任意,类型为txt,位置位于本程序的同目录下,文件内容示例如下: +``` +1223178222 胡歌 +1669879400 迪丽热巴 +1729370543 郭碧婷 +``` +假如文件叫user_id_list.txt,则user_id设置代码为: ```python -"""可以在文件中读取user_id_list,文件中可以包含很多user_id, -每个user_id占一行,文件名任意,类型为txt,位置位于本程序的同目录下, -比如文件可以叫user_id_list.txt""" user_id_list = wb.get_user_list('user_id_list.txt') ``` ### 5.设置数据库(可选) @@ -242,7 +245,11 @@ $ python weibospider.py 爬多个微博,user_id_list如下所示,可以改成任意合法的用户id user_id_list = ['1223178222', '1669879400', '1729370543'] 也可以在文件中读取user_id_list,文件中可以包含很多user_id, - 每个user_id占一行,文件名任意,类型为txt,位置位于本程序的同目录下, + 每个user_id占一行,也可以在user_id后面加注释,如用户昵称,user_id和注释之间必需要有空格, + 文件名任意,类型为txt,位置位于本程序的同目录下,文件内容可以为如下形式: + 1223178222 胡歌 + 1669879400 迪丽热巴 + 1729370543 郭碧婷 比如文件可以叫user_id_list.txt,读取文件中的user_id_list如下所示: user_id_list = wb.get_user_list('user_id_list.txt')""" user_id_list = ['1223178222', '1669879400', '1729370543'] From 0bb914093862ae90213f09cbd854ac6a25a6bb50 Mon Sep 17 00:00:00 2001 From: songzy Date: Sat, 26 Oct 2019 00:47:35 +0800 Subject: [PATCH 083/363] code refactor --- .gitignore | 3 + config_sample.json | 18 + downloader.py | 67 ++++ html_parser.py | 325 ++++++++++++++++ printer.py | 18 + spider.py | 150 +++++++ validator.py | 38 ++ weiboSpider.py | 943 --------------------------------------------- writer.py | 261 +++++++++++++ 9 files changed, 880 insertions(+), 943 deletions(-) create mode 100644 .gitignore create mode 100644 config_sample.json create mode 100644 
downloader.py create mode 100644 html_parser.py create mode 100644 printer.py create mode 100644 spider.py create mode 100644 validator.py delete mode 100644 weiboSpider.py create mode 100644 writer.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..60c44892 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +config.json +*.pyc +weibo/ \ No newline at end of file diff --git a/config_sample.json b/config_sample.json new file mode 100644 index 00000000..5af9b697 --- /dev/null +++ b/config_sample.json @@ -0,0 +1,18 @@ +{ + "filter": 1, + "since_date": "2018-01-01", + "write_mode": "txt", + "pic_download": 1, + "video_download": 1, + "cookie": "your cookie", + "mysql_config": { + "host": "localhost", + "port": 3306, + "user": "root", + "password": "123456", + "charset": "utf8mb4" + }, + "user_id_list": [ + "1669879400" + ] +} \ No newline at end of file diff --git a/downloader.py b/downloader.py new file mode 100644 index 00000000..926226bf --- /dev/null +++ b/downloader.py @@ -0,0 +1,67 @@ +import os +import sys +from tqdm import tqdm +import traceback +import requests +from requests.adapters import HTTPAdapter + + +class Downloader: + def __init__(self, config): + self.config = config + + def download_files(self, file_path, type, weibo): + """下载文件(图片/视频)""" + try: + + if type == 'img': + describe = u'图片' + key = 'original_pictures' + else: + describe = u'视频' + key = 'video_url' + print(u'即将进行%s下载' % describe) + for w in tqdm(weibo, desc=u'%s下载进度' % describe): + if w[key] != u'无': + file_prefix = w['publish_time'][:11].replace( + '-', '') + '_' + w['id'] + if type == 'img' and ',' in w[key]: + w[key] = w[key].split(',') + for j, url in enumerate(w[key]): + file_suffix = url[url.rfind('.'):] + file_name = file_prefix + '_' + str( + j + 1) + file_suffix + self.download_one_file( + url, file_path + os.sep + file_name, type, + w['id']) + else: + if type == 'video': + file_suffix = '.mp4' + else: + file_suffix = w[key][w[key].rfind('.'):] + file_name 
= file_prefix + file_suffix + self.download_one_file(w[key], + file_path + os.sep + file_name, + type, w['id']) + print(u'%s下载完毕,保存路径:' % describe) + print(file_path) + except Exception as e: + print('Error: ', e) + traceback.print_exc() + + def download_one_file(self, url, file_path, type, weibo_id): + """下载单个文件(图片/视频)""" + try: + if not os.path.isfile(file_path): + s = requests.Session() + s.mount(url, HTTPAdapter(max_retries=5)) + downloaded = s.get(url, timeout=(5, 10)) + with open(file_path, 'wb') as f: + f.write(downloaded.content) + except Exception as e: + error_file = './not_downloaded.txt' + with open(error_file, 'ab') as f: + url = weibo_id + ':' + url + '\n' + f.write(url.encode(sys.stdout.encoding)) + print('Error: ', e) + traceback.print_exc() diff --git a/html_parser.py b/html_parser.py new file mode 100644 index 00000000..f82b9986 --- /dev/null +++ b/html_parser.py @@ -0,0 +1,325 @@ +import os +import requests +import sys +from lxml import etree +import traceback +import re +from datetime import datetime, timedelta +from collections import OrderedDict + + +class Parser: + def __init__(self, config): + self.config = config + + def deal_html(self, url, cookie): + """处理html""" + print("url:", url) + html = requests.get(url, cookies=cookie).content + selector = etree.HTML(html) + return selector + + def deal_garbled(self, info): + """处理乱码""" + info = (info.xpath('string(.)').replace(u'\u200b', '').encode( + sys.stdout.encoding, 'ignore').decode(sys.stdout.encoding)) + return info + + def extract_picture_urls(self, info, weibo_id): + """提取微博原始图片url""" + try: + a_list = info.xpath('div/a/@href') + first_pic = 'https://weibo.cn/mblog/pic/' + weibo_id + '?rl=0' + all_pic = 'https://weibo.cn/mblog/picAll/' + weibo_id + '?rl=1' + if first_pic in a_list: + if all_pic in a_list: + selector = self.deal_html(all_pic, self.config['cookie']) + preview_picture_list = selector.xpath('//img/@src') + picture_list = [ + p.replace('/thumb180/', '/large/') + for p in 
preview_picture_list + ] + picture_urls = ','.join(picture_list) + else: + if info.xpath('.//img/@src'): + preview_picture = info.xpath('.//img/@src')[-1] + picture_urls = preview_picture.replace( + '/wap180/', '/large/') + else: + sys.exit( + u"爬虫微博可能被设置成了'不显示图片',请前往" + u"'https://weibo.cn/account/customize/pic',修改为'显示'" + ) + else: + picture_urls = u'无' + return picture_urls + except Exception as e: + return u'无' + + def get_picture_urls(self, info, is_original): + """获取微博原始图片url""" + try: + weibo_id = info.xpath('@id')[0][2:] + picture_urls = {} + if is_original: + original_pictures = self.extract_picture_urls(info, weibo_id) + picture_urls['original_pictures'] = original_pictures + if not self.config['filter']: + picture_urls['retweet_pictures'] = u'无' + else: + retweet_url = info.xpath("div/a[@class='cc']/@href")[0] + retweet_id = retweet_url.split('/')[-1].split('?')[0] + retweet_pictures = self.extract_picture_urls(info, retweet_id) + picture_urls['retweet_pictures'] = retweet_pictures + a_list = info.xpath('div[last()]/a/@href') + original_picture = u'无' + for a in a_list: + if a.endswith(('.gif', '.jpeg', '.jpg', '.png')): + original_picture = a + break + picture_urls['original_pictures'] = original_picture + return picture_urls + except Exception as e: + print('Error: ', e) + traceback.print_exc() + + def get_video_url(self, info, is_original): + """获取微博视频url""" + try: + if is_original: + div_first = info.xpath('div')[0] + a_list = div_first.xpath('.//a') + video_link = u'无' + for a in a_list: + if 'm.weibo.cn/s/video/show?object_id=' in a.xpath( + '@href')[0]: + video_link = a.xpath('@href')[0] + break + if video_link != u'无': + video_link = video_link.replace( + 'm.weibo.cn/s/video/show', 'm.weibo.cn/s/video/object') + wb_info = requests.get( + video_link, cookies=self.config['cookie']).json() + video_url = wb_info['data']['object']['stream'].get( + 'hd_url') + if not video_url: + video_url = wb_info['data']['object']['stream']['url'] + if not 
video_url: # 说明该视频为直播 + video_url = u'无' + else: + video_url = u'无' + return video_url + except Exception as e: + return u'无' + + def get_page_num(self, selector): + """获取微博总页数""" + + if selector.xpath("//input[@name='mp']") == []: + page_num = 1 + else: + page_num = (int)( + selector.xpath("//input[@name='mp']")[0].attrib['value']) + return page_num + + def get_long_weibo(self, weibo_link): + """获取长原创微博""" + + selector = self.deal_html(weibo_link, self.config['cookie']) + info = selector.xpath("//div[@class='c']")[1] + wb_content = self.deal_garbled(info) + wb_time = info.xpath("//span[@class='ct']/text()")[0] + weibo_content = wb_content[wb_content.find(':') + + 1:wb_content.rfind(wb_time)] + return weibo_content + + def get_original_weibo(self, info, weibo_id): + """获取原创微博""" + + weibo_content = self.deal_garbled(info) + weibo_content = weibo_content[:weibo_content.rfind(u'赞')] + a_text = info.xpath('div//a/text()') + if u'全文' in a_text: + weibo_link = 'https://weibo.cn/comment/' + weibo_id + wb_content = self.get_long_weibo(weibo_link) + if wb_content: + weibo_content = wb_content + return weibo_content + + def get_long_retweet(self, weibo_link): + """获取长转发微博""" + wb_content = self.get_long_weibo(weibo_link) + weibo_content = wb_content[:wb_content.rfind(u'原文转发')] + return weibo_content + + def get_retweet(self, info, weibo_id): + """获取转发微博""" + original_user = info.xpath("div/span[@class='cmt']/a/text()") + if not original_user: + wb_content = u'转发微博已被删除' + return wb_content + else: + original_user = original_user[0] + wb_content = self.deal_garbled(info) + wb_content = wb_content[wb_content.find(':') + + 1:wb_content.rfind(u'赞')] + wb_content = wb_content[:wb_content.rfind(u'赞')] + a_text = info.xpath('div//a/text()') + if u'全文' in a_text: + weibo_link = 'https://weibo.cn/comment/' + weibo_id + weibo_content = self.get_long_retweet(weibo_link) + if weibo_content: + wb_content = weibo_content + retweet_reason = self.deal_garbled(info.xpath('div')[-1]) + 
retweet_reason = retweet_reason[:retweet_reason.rindex(u'赞')] + wb_content = (retweet_reason + '\n' + u'原始用户: ' + original_user + + '\n' + u'转发内容: ' + wb_content) + return wb_content + + def is_original(self, info): + """判断微博是否为原创微博""" + is_original = info.xpath("div/span[@class='cmt']") + if len(is_original) > 3: + return False + else: + return True + + def get_weibo_content(self, info, is_original): + """获取微博内容""" + weibo_id = info.xpath('@id')[0][2:] + if is_original: + weibo_content = self.get_original_weibo(info, weibo_id) + else: + weibo_content = self.get_retweet(info, weibo_id) + return weibo_content + + def get_publish_place(self, info): + """获取微博发布位置""" + div_first = info.xpath('div')[0] + a_list = div_first.xpath('a') + publish_place = u'无' + for a in a_list: + if ('place.weibo.com' in a.xpath('@href')[0] + and a.xpath('text()')[0] == u'显示地图'): + weibo_a = div_first.xpath("span[@class='ctt']/a") + if len(weibo_a) >= 1: + publish_place = weibo_a[-1] + if (u'视频' == div_first.xpath("span[@class='ctt']/a/text()") + [-1][-2:]): + if len(weibo_a) >= 2: + publish_place = weibo_a[-2] + else: + publish_place = u'无' + publish_place = self.deal_garbled(publish_place) + break + return publish_place + + def get_publish_time(self, info): + """获取微博发布时间""" + try: + str_time = info.xpath("div/span[@class='ct']") + str_time = self.deal_garbled(str_time[0]) + publish_time = str_time.split(u'来自')[0] + if u'刚刚' in publish_time: + publish_time = datetime.now().strftime('%Y-%m-%d %H:%M') + elif u'分钟' in publish_time: + minute = publish_time[:publish_time.find(u'分钟')] + minute = timedelta(minutes=int(minute)) + publish_time = (datetime.now() - + minute).strftime('%Y-%m-%d %H:%M') + elif u'今天' in publish_time: + today = datetime.now().strftime('%Y-%m-%d') + time = publish_time[3:] + publish_time = today + ' ' + time + if len(publish_time) > 16: + publish_time = publish_time[:16] + elif u'月' in publish_time: + year = datetime.now().strftime('%Y') + month = publish_time[0:2] + day 
= publish_time[3:5] + time = publish_time[7:12] + publish_time = year + '-' + month + '-' + day + ' ' + time + else: + publish_time = publish_time[:16] + return publish_time + except Exception as e: + print('Error: ', e) + traceback.print_exc() + + def get_publish_tool(self, info): + """获取微博发布工具""" + try: + str_time = info.xpath("div/span[@class='ct']") + str_time = self.deal_garbled(str_time[0]) + if len(str_time.split(u'来自')) > 1: + publish_tool = str_time.split(u'来自')[1] + else: + publish_tool = u'无' + return publish_tool + except Exception as e: + print('Error: ', e) + traceback.print_exc() + + def get_weibo_footer(self, info): + """获取微博点赞数、转发数、评论数""" + try: + footer = {} + pattern = r'\d+' + str_footer = info.xpath('div')[-1] + str_footer = self.deal_garbled(str_footer) + str_footer = str_footer[str_footer.rfind(u'赞'):] + weibo_footer = re.findall(pattern, str_footer, re.M) + + up_num = int(weibo_footer[0]) + footer['up_num'] = up_num + + retweet_num = int(weibo_footer[1]) + footer['retweet_num'] = retweet_num + + comment_num = int(weibo_footer[2]) + footer['comment_num'] = comment_num + return footer + except Exception as e: + print('Error: ', e) + traceback.print_exc() + + def get_one_weibo(self, info): + """获取一条微博的全部信息""" + try: + weibo = OrderedDict() + is_original = self.is_original(info) + if (not self.config['filter']) or is_original: + weibo['id'] = info.xpath('@id')[0][2:] + weibo['content'] = self.get_weibo_content(info, + is_original) # 微博内容 + weibo['publish_place'] = self.get_publish_place(info) # 微博发布位置 + weibo['publish_time'] = self.get_publish_time(info) # 微博发布时间 + weibo['publish_tool'] = self.get_publish_tool(info) # 微博发布工具 + footer = self.get_weibo_footer(info) + weibo['up_num'] = footer['up_num'] # 微博点赞数 + weibo['retweet_num'] = footer['retweet_num'] # 转发数 + weibo['comment_num'] = footer['comment_num'] # 评论数 + + picture_urls = self.get_picture_urls(info, is_original) + weibo['original_pictures'] = picture_urls[ + 'original_pictures'] # 
原创图片url + if not self.config['filter']: + weibo['retweet_pictures'] = picture_urls[ + 'retweet_pictures'] # 转发图片url + weibo['original'] = is_original # 是否原创微博 + weibo['video_url'] = self.get_video_url(info, + is_original) # 微博视频url + else: + weibo = None + return weibo + except Exception as e: + print('Error: ', e) + traceback.print_exc() + + def is_pinned_weibo(self, info): + """判断微博是否为置顶微博""" + kt = info.xpath(".//span[@class='kt']/text()") + if kt and kt[0] == u'置顶': + return True + else: + return False \ No newline at end of file diff --git a/printer.py b/printer.py new file mode 100644 index 00000000..8c126bf6 --- /dev/null +++ b/printer.py @@ -0,0 +1,18 @@ +class Printer: + def print_one_weibo(self, weibo): + """打印一条微博""" + print(weibo['content']) + print(u'微博发布位置:%s' % weibo['publish_place']) + print(u'微博发布时间:%s' % weibo['publish_time']) + print(u'微博发布工具:%s' % weibo['publish_tool']) + print(u'点赞数:%d' % weibo['up_num']) + print(u'转发数:%d' % weibo['retweet_num']) + print(u'评论数:%d' % weibo['comment_num']) + + def print_user_info(self, user): + """打印微博用户信息""" + print(u'用户昵称: %s' % user['nickname']) + print(u'用户id: %s' % user['user_id']) + print(u'微博数: %d' % user['weibo_num']) + print(u'关注数: %d' % user['following']) + print(u'粉丝数: %d' % user['followers']) diff --git a/spider.py b/spider.py new file mode 100644 index 00000000..8faed152 --- /dev/null +++ b/spider.py @@ -0,0 +1,150 @@ +#!/usr/bin/env python +# -*- coding: UTF-8 -*- + +import codecs +import csv +import os +import random +import re +import sys +import traceback +from collections import OrderedDict +from datetime import datetime, timedelta +from time import sleep + +from lxml import etree +import requests +from tqdm import tqdm + +from validator import Validator +from printer import Printer +from writer import Writer, get_filepath +from downloader import Downloader +from html_parser import Parser + + +class Spider(object): + def __init__(self, config): + """Weibo类初始化""" + self.config = config + # change 
cookie from string to dict + if type(self.config['cookie']) == type(''): + self.config['cookie'] = { + t.strip().split("=")[0]: t.strip().split("=")[1] + for t in self.config['cookie'].split(";") + } + self.validator = Validator(self.config) + self.validator.validate() + self.printer = Printer() + self.writer = Writer(self.config) + self.downloader = Downloader(self.config) + self.parser = Parser(self.config) + + def get_nickname(self): + """获取用户昵称""" + url = 'https://weibo.cn/%s/info' % (self.user['user_id']) + selector = self.parser.deal_html(url, self.config['cookie']) + nickname = selector.xpath('//title/text()')[0] + nickname = nickname[:-3] + if nickname == u'登录 - 新' or nickname == u'新浪': + sys.exit(u'cookie错误或已过期,请按照README中方法重新获取') + self.user['nickname'] = nickname + + def get_user_info(self, selector): + """获取用户昵称、微博数、关注数、粉丝数""" + self.get_nickname() # 获取用户昵称 + user_info = selector.xpath("//div[@class='tip2']/*/text()") + + self.user['weibo_num'] = int(user_info[0][3:-1]) + self.user['following'] = int(user_info[1][3:-1]) + self.user['followers'] = int(user_info[2][3:-1]) + self.printer.print_user_info(self.user) + self.writer.write_user(self.user) + print('*' * 100) + + def get_one_page(self, page): + """获取第page页的全部微博""" + url = 'https://weibo.cn/u/%s?page=%d' % (self.user['user_id'], page) + selector = self.parser.deal_html(url, self.config['cookie']) + info = selector.xpath("//div[@class='c']") + is_exist = info[0].xpath("div/span[@class='ctt']") + if is_exist: + for i in range(0, len(info) - 2): + weibo = self.parser.get_one_weibo(info[i]) + if weibo: + if weibo['id'] in self.weibo_id_list: + continue + publish_time = datetime.strptime( + weibo['publish_time'][:10], "%Y-%m-%d") + since_date = datetime.strptime(self.config['since_date'], + "%Y-%m-%d") + if publish_time < since_date: + if self.parser.is_pinned_weibo(info[i]): + continue + else: + return True + self.printer.print_one_weibo(weibo) + + self.weibo.append(weibo) + 
self.weibo_id_list.append(weibo['id']) + self.got_num += 1 + print('-' * 100) + + self.writer.write_weibo([weibo]) + + def get_weibo_info(self): + """获取微博信息""" + url = 'https://weibo.cn/u/%s' % (self.user['user_id']) + selector = self.parser.deal_html(url, self.config['cookie']) + self.get_user_info(selector) # 获取用户昵称、微博数、关注数、粉丝数 + + page_num = self.parser.get_page_num(selector) # 获取微博总页数 + page1 = 0 + random_pages = random.randint(1, 5) + for page in tqdm(range(1, page_num + 1), desc=u'进度'): + is_end = self.get_one_page(page) # 获取第page页的全部微博 + if is_end: + break + + # 通过加入随机等待避免被限制。爬虫速度过快容易被系统限制(一段时间后限 + # 制会自动解除),加入随机等待模拟人的操作,可降低被系统限制的风险。默 + # 认是每爬取1到5页随机等待6到10秒,如果仍然被限,可适当增加sleep时间 + if page - page1 == random_pages and page < page_num: + sleep(random.randint(6, 10)) + page1 = page + random_pages = random.randint(1, 5) + + if not self.config['filter']: + print(u'共爬取' + str(self.got_num) + u'条微博') + else: + print(u'共爬取' + str(self.got_num) + u'条原创微博') + + def initialize_info(self, user_id): + """初始化爬虫信息""" + self.got_num = 0 # 爬取到的微博数 + self.weibo = [] # 存储爬取到的所有微博信息 + self.user = {'user_id': user_id} # 存储爬取到的用户信息 + self.weibo_id_list = [] # 存储爬取到的所有微博id + + def start(self): + """运行爬虫""" + for user_id in self.config['user_id_list']: + self.initialize_info(user_id) + print('*' * 100) + self.get_weibo_info() + print(u'信息抓取完毕') + print('*' * 100) + if self.config['pic_download'] == 1: + file_path = get_filepath('img', self.user['nickname']) + self.downloader.download_files(file_path, 'img', self.weibo) + if self.config['video_download'] == 1: + file_path = get_filepath('video', self.user['nickname']) + self.downloader.download_files(file_path, 'video', self.weibo) + + +if __name__ == '__main__': + import json + with open("./config.json") as f: + config = json.loads(f.read()) + spider = Spider(config) + spider.start() # 爬取微博信息 diff --git a/validator.py b/validator.py new file mode 100644 index 00000000..aecb7873 --- /dev/null +++ b/validator.py @@ -0,0 +1,38 @@ +from 
datetime import datetime +import sys + +class Validator: + def __init__(self, config): + """ + self.user_id = '' # 用户id,如昵称为"Dear-迪丽热巴"的id为'1669879400' + self.filter = filter # 取值范围为0、1,程序默认值为0,代表要爬取用户的全部微博,1代表只爬取用户的原创微博 + self.since_date = since_date # 起始时间,即爬取发布日期从该值到现在的微博,形式为yyyy-mm-dd + self.mongodb_write = mongodb_write # 值为0代表不将结果写入MongoDB数据库,1代表写入 + self.mysql_write = mysql_write # 值为0代表不将结果写入MySQL数据库,1代表写入 + self.pic_download = pic_download # 取值范围为0、1,程序默认值为0,代表不下载微博原始图片,1代表下载 + self.video_download = video_download # 取值范围为0、1,程序默认为0,代表不下载微博视频,1代表下载 + self.mysql_config = { + } # MySQL数据库连接配置,可以不填,当使用者的mysql用户名、密码等与本程序默认值不同时,需要通过mysql_config来自定义 + """ + self.config = config + + def validate(self): + bool_config = ["filter", "pic_download", "video_download"] + date_config = ["since_date"] + + for key in bool_config: + if self.config[key] not in [0, 1]: + sys.exit(f"{key}值应为0或1,请重新输入") + for key in date_config: + if not self.is_date(self.config[key]): + sys.exit(f"{key}值应为yyyy-mm-dd形式,请重新输入") + if self.config['write_mode'] not in ['txt', 'csv', 'mysql', 'mongo']: + sys.exit("write_mode值应为txt,csv,mysql,mongo,请重新输入") + + def is_date(self, since_date): + """判断日期格式是否正确""" + try: + datetime.strptime(since_date, "%Y-%m-%d") + return True + except ValueError: + return False diff --git a/weiboSpider.py b/weiboSpider.py deleted file mode 100644 index 529781ed..00000000 --- a/weiboSpider.py +++ /dev/null @@ -1,943 +0,0 @@ -#!/usr/bin/env python -# -*- coding: UTF-8 -*- - -import codecs -import csv -import os -import random -import re -import sys -import traceback -from collections import OrderedDict -from datetime import datetime, timedelta -from time import sleep - -import requests -from lxml import etree -from requests.adapters import HTTPAdapter -from tqdm import tqdm - - -class Weibo(object): - cookie = {'Cookie': 'your cookie'} # 将your cookie替换成自己的cookie - - def __init__(self, - filter=0, - since_date='1900-01-01', - mongodb_write=0, - mysql_write=0, - 
pic_download=0, - video_download=0): - """Weibo类初始化""" - if filter != 0 and filter != 1: - sys.exit(u'filter值应为0或1,请重新输入') - if not self.is_date(since_date): - sys.exit(u'since_date值应为yyyy-mm-dd形式,请重新输入') - if mongodb_write != 0 and mongodb_write != 1: - sys.exit(u'mongodb_write值应为0或1,请重新输入') - if mysql_write != 0 and mysql_write != 1: - sys.exit(u'mysql_write值应为0或1,请重新输入') - if pic_download != 0 and pic_download != 1: - sys.exit(u'pic_download值应为0或1,请重新输入') - if video_download != 0 and video_download != 1: - sys.exit(u'video_download值应为0或1,请重新输入') - self.user_id = '' # 用户id,如昵称为"Dear-迪丽热巴"的id为'1669879400' - self.filter = filter # 取值范围为0、1,程序默认值为0,代表要爬取用户的全部微博,1代表只爬取用户的原创微博 - self.since_date = since_date # 起始时间,即爬取发布日期从该值到现在的微博,形式为yyyy-mm-dd - self.mongodb_write = mongodb_write # 值为0代表不将结果写入MongoDB数据库,1代表写入 - self.mysql_write = mysql_write # 值为0代表不将结果写入MySQL数据库,1代表写入 - self.pic_download = pic_download # 取值范围为0、1,程序默认值为0,代表不下载微博原始图片,1代表下载 - self.video_download = video_download # 取值范围为0、1,程序默认为0,代表不下载微博视频,1代表下载 - self.got_num = 0 # 爬取到的微博数 - self.weibo = [] # 存储爬取到的所有微博信息 - self.user = {} # 存储爬取到的用户信息 - self.weibo_id_list = [] # 存储爬取到的所有微博id - self.mysql_config = { - } # MySQL数据库连接配置,可以不填,当使用者的mysql用户名、密码等与本程序默认值不同时,需要通过mysql_config来自定义 - - def is_date(self, since_date): - """判断日期格式是否正确""" - try: - datetime.strptime(since_date, "%Y-%m-%d") - return True - except ValueError: - return False - - def deal_html(self, url): - """处理html""" - try: - html = requests.get(url, cookies=self.cookie).content - selector = etree.HTML(html) - return selector - except Exception as e: - print('Error: ', e) - traceback.print_exc() - - def deal_garbled(self, info): - """处理乱码""" - try: - info = (info.xpath('string(.)').replace(u'\u200b', '').encode( - sys.stdout.encoding, 'ignore').decode(sys.stdout.encoding)) - return info - except Exception as e: - print('Error: ', e) - traceback.print_exc() - - def get_nickname(self): - """获取用户昵称""" - try: - url = 'https://weibo.cn/%s/info' % 
(self.user_id) - selector = self.deal_html(url) - nickname = selector.xpath('//title/text()')[0] - nickname = nickname[:-3] - if nickname == u'登录 - 新' or nickname == u'新浪': - sys.exit(u'cookie错误或已过期,请按照README中方法重新获取') - self.user['nickname'] = nickname - except Exception as e: - print('Error: ', e) - traceback.print_exc() - - def user_to_mongodb(self): - """将爬取的用户信息写入MongoDB数据库""" - user_list = [self.user] - self.info_to_mongodb('user', user_list) - print(u'%s信息写入MongoDB数据库完毕' % self.user['nickname']) - - def user_to_mysql(self): - """将爬取的用户信息写入MySQL数据库""" - mysql_config = { - 'host': 'localhost', - 'port': 3306, - 'user': 'root', - 'password': '123456', - 'charset': 'utf8mb4' - } - # 创建'weibo'数据库 - create_database = """CREATE DATABASE IF NOT EXISTS weibo DEFAULT - CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci""" - self.mysql_create_database(mysql_config, create_database) - # 创建'user'表 - create_table = """ - CREATE TABLE IF NOT EXISTS user ( - id varchar(12) NOT NULL, - nickname varchar(30), - weibo_num INT, - following INT, - followers INT, - PRIMARY KEY (id) - ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4""" - self.mysql_create_table(mysql_config, create_table) - self.mysql_insert(mysql_config, 'user', [self.user]) - print(u'%s信息写入MySQL数据库完毕' % self.user['nickname']) - - def user_to_database(self): - """将用户信息写入数据库""" - if self.mysql_write: - self.user_to_mysql() - if self.mongodb_write: - self.user_to_mongodb() - - def print_user_info(self): - """打印微博用户信息""" - print(u'用户昵称: %s' % self.user['nickname']) - print(u'用户id: %s' % self.user['id']) - print(u'微博数: %d' % self.user['weibo_num']) - print(u'关注数: %d' % self.user['following']) - print(u'粉丝数: %d' % self.user['followers']) - - def get_user_info(self, selector): - """获取用户昵称、微博数、关注数、粉丝数""" - try: - self.get_nickname() # 获取用户昵称 - user_info = selector.xpath("//div[@class='tip2']/*/text()") - weibo_num = int(user_info[0][3:-1]) - following = int(user_info[1][3:-1]) - followers = int(user_info[2][3:-1]) - 
self.user['weibo_num'] = weibo_num - self.user['following'] = following - self.user['followers'] = followers - self.user['id'] = self.user_id - self.print_user_info() - self.user_to_database() - print('*' * 100) - except Exception as e: - print('Error: ', e) - traceback.print_exc() - - def get_page_num(self, selector): - """获取微博总页数""" - try: - if selector.xpath("//input[@name='mp']") == []: - page_num = 1 - else: - page_num = (int)( - selector.xpath("//input[@name='mp']")[0].attrib['value']) - return page_num - except Exception as e: - print('Error: ', e) - traceback.print_exc() - - def get_long_weibo(self, weibo_link): - """获取长原创微博""" - try: - selector = self.deal_html(weibo_link) - info = selector.xpath("//div[@class='c']")[1] - wb_content = self.deal_garbled(info) - wb_time = info.xpath("//span[@class='ct']/text()")[0] - weibo_content = wb_content[wb_content.find(':') + - 1:wb_content.rfind(wb_time)] - return weibo_content - except Exception as e: - return u'网络出错' - print('Error: ', e) - traceback.print_exc() - - def get_original_weibo(self, info, weibo_id): - """获取原创微博""" - try: - weibo_content = self.deal_garbled(info) - weibo_content = weibo_content[:weibo_content.rfind(u'赞')] - a_text = info.xpath('div//a/text()') - if u'全文' in a_text: - weibo_link = 'https://weibo.cn/comment/' + weibo_id - wb_content = self.get_long_weibo(weibo_link) - if wb_content: - weibo_content = wb_content - return weibo_content - except Exception as e: - print('Error: ', e) - traceback.print_exc() - - def get_long_retweet(self, weibo_link): - """获取长转发微博""" - try: - wb_content = self.get_long_weibo(weibo_link) - weibo_content = wb_content[:wb_content.rfind(u'原文转发')] - return weibo_content - except Exception as e: - print('Error: ', e) - traceback.print_exc() - - def get_retweet(self, info, weibo_id): - """获取转发微博""" - try: - original_user = info.xpath("div/span[@class='cmt']/a/text()") - if not original_user: - wb_content = u'转发微博已被删除' - return wb_content - else: - original_user = 
original_user[0] - wb_content = self.deal_garbled(info) - wb_content = wb_content[wb_content.find(':') + - 1:wb_content.rfind(u'赞')] - wb_content = wb_content[:wb_content.rfind(u'赞')] - a_text = info.xpath('div//a/text()') - if u'全文' in a_text: - weibo_link = 'https://weibo.cn/comment/' + weibo_id - weibo_content = self.get_long_retweet(weibo_link) - if weibo_content: - wb_content = weibo_content - retweet_reason = self.deal_garbled(info.xpath('div')[-1]) - retweet_reason = retweet_reason[:retweet_reason.rindex(u'赞')] - wb_content = (retweet_reason + '\n' + u'原始用户: ' + original_user + - '\n' + u'转发内容: ' + wb_content) - return wb_content - except Exception as e: - print('Error: ', e) - traceback.print_exc() - - def is_original(self, info): - """判断微博是否为原创微博""" - is_original = info.xpath("div/span[@class='cmt']") - if len(is_original) > 3: - return False - else: - return True - - def get_weibo_content(self, info, is_original): - """获取微博内容""" - try: - weibo_id = info.xpath('@id')[0][2:] - if is_original: - weibo_content = self.get_original_weibo(info, weibo_id) - else: - weibo_content = self.get_retweet(info, weibo_id) - return weibo_content - except Exception as e: - print('Error: ', e) - traceback.print_exc() - - def get_publish_place(self, info): - """获取微博发布位置""" - try: - div_first = info.xpath('div')[0] - a_list = div_first.xpath('a') - publish_place = u'无' - for a in a_list: - if ('place.weibo.com' in a.xpath('@href')[0] - and a.xpath('text()')[0] == u'显示地图'): - weibo_a = div_first.xpath("span[@class='ctt']/a") - if len(weibo_a) >= 1: - publish_place = weibo_a[-1] - if (u'视频' == div_first.xpath( - "span[@class='ctt']/a/text()")[-1][-2:]): - if len(weibo_a) >= 2: - publish_place = weibo_a[-2] - else: - publish_place = u'无' - publish_place = self.deal_garbled(publish_place) - break - return publish_place - except Exception as e: - print('Error: ', e) - traceback.print_exc() - - def get_publish_time(self, info): - """获取微博发布时间""" - try: - str_time = 
info.xpath("div/span[@class='ct']") - str_time = self.deal_garbled(str_time[0]) - publish_time = str_time.split(u'来自')[0] - if u'刚刚' in publish_time: - publish_time = datetime.now().strftime('%Y-%m-%d %H:%M') - elif u'分钟' in publish_time: - minute = publish_time[:publish_time.find(u'分钟')] - minute = timedelta(minutes=int(minute)) - publish_time = (datetime.now() - - minute).strftime('%Y-%m-%d %H:%M') - elif u'今天' in publish_time: - today = datetime.now().strftime('%Y-%m-%d') - time = publish_time[3:] - publish_time = today + ' ' + time - if len(publish_time) > 16: - publish_time = publish_time[:16] - elif u'月' in publish_time: - year = datetime.now().strftime('%Y') - month = publish_time[0:2] - day = publish_time[3:5] - time = publish_time[7:12] - publish_time = year + '-' + month + '-' + day + ' ' + time - else: - publish_time = publish_time[:16] - return publish_time - except Exception as e: - print('Error: ', e) - traceback.print_exc() - - def get_publish_tool(self, info): - """获取微博发布工具""" - try: - str_time = info.xpath("div/span[@class='ct']") - str_time = self.deal_garbled(str_time[0]) - if len(str_time.split(u'来自')) > 1: - publish_tool = str_time.split(u'来自')[1] - else: - publish_tool = u'无' - return publish_tool - except Exception as e: - print('Error: ', e) - traceback.print_exc() - - def get_weibo_footer(self, info): - """获取微博点赞数、转发数、评论数""" - try: - footer = {} - pattern = r'\d+' - str_footer = info.xpath('div')[-1] - str_footer = self.deal_garbled(str_footer) - str_footer = str_footer[str_footer.rfind(u'赞'):] - weibo_footer = re.findall(pattern, str_footer, re.M) - - up_num = int(weibo_footer[0]) - footer['up_num'] = up_num - - retweet_num = int(weibo_footer[1]) - footer['retweet_num'] = retweet_num - - comment_num = int(weibo_footer[2]) - footer['comment_num'] = comment_num - return footer - except Exception as e: - print('Error: ', e) - traceback.print_exc() - - def extract_picture_urls(self, info, weibo_id): - """提取微博原始图片url""" - try: - a_list = 
info.xpath('div/a/@href') - first_pic = 'https://weibo.cn/mblog/pic/' + weibo_id + '?rl=0' - all_pic = 'https://weibo.cn/mblog/picAll/' + weibo_id + '?rl=1' - if first_pic in a_list: - if all_pic in a_list: - selector = self.deal_html(all_pic) - preview_picture_list = selector.xpath('//img/@src') - picture_list = [ - p.replace('/thumb180/', '/large/') - for p in preview_picture_list - ] - picture_urls = ','.join(picture_list) - else: - if info.xpath('.//img/@src'): - preview_picture = info.xpath('.//img/@src')[-1] - picture_urls = preview_picture.replace( - '/wap180/', '/large/') - else: - sys.exit( - u"爬虫微博可能被设置成了'不显示图片',请前往" - u"'https://weibo.cn/account/customize/pic',修改为'显示'" - ) - else: - picture_urls = u'无' - return picture_urls - except Exception as e: - return u'无' - print('Error: ', e) - traceback.print_exc() - - def get_picture_urls(self, info, is_original): - """获取微博原始图片url""" - try: - weibo_id = info.xpath('@id')[0][2:] - picture_urls = {} - if is_original: - original_pictures = self.extract_picture_urls(info, weibo_id) - picture_urls['original_pictures'] = original_pictures - if not self.filter: - picture_urls['retweet_pictures'] = u'无' - else: - retweet_url = info.xpath("div/a[@class='cc']/@href")[0] - retweet_id = retweet_url.split('/')[-1].split('?')[0] - retweet_pictures = self.extract_picture_urls(info, retweet_id) - picture_urls['retweet_pictures'] = retweet_pictures - a_list = info.xpath('div[last()]/a/@href') - original_picture = u'无' - for a in a_list: - if a.endswith(('.gif', '.jpeg', '.jpg', '.png')): - original_picture = a - break - picture_urls['original_pictures'] = original_picture - return picture_urls - except Exception as e: - print('Error: ', e) - traceback.print_exc() - - def get_video_url(self, info, is_original): - """获取微博视频url""" - try: - if is_original: - div_first = info.xpath('div')[0] - a_list = div_first.xpath('.//a') - video_link = u'无' - for a in a_list: - if 'm.weibo.cn/s/video/show?object_id=' in a.xpath( - '@href')[0]: 
- video_link = a.xpath('@href')[0] - break - if video_link != u'无': - video_link = video_link.replace( - 'm.weibo.cn/s/video/show', 'm.weibo.cn/s/video/object') - wb_info = requests.get(video_link, - cookies=self.cookie).json() - video_url = wb_info['data']['object']['stream'].get( - 'hd_url') - if not video_url: - video_url = wb_info['data']['object']['stream']['url'] - if not video_url: # 说明该视频为直播 - video_url = u'无' - else: - video_url = u'无' - return video_url - except Exception as e: - return u'无' - print('Error: ', e) - traceback.print_exc() - - def download_one_file(self, url, file_path, type, weibo_id): - """下载单个文件(图片/视频)""" - try: - if not os.path.isfile(file_path): - s = requests.Session() - s.mount(url, HTTPAdapter(max_retries=5)) - downloaded = s.get(url, timeout=(5, 10)) - with open(file_path, 'wb') as f: - f.write(downloaded.content) - except Exception as e: - error_file = self.get_filepath( - type) + os.sep + 'not_downloaded.txt' - with open(error_file, 'ab') as f: - url = weibo_id + ':' + url + '\n' - f.write(url.encode(sys.stdout.encoding)) - print('Error: ', e) - traceback.print_exc() - - def download_files(self, type): - """下载文件(图片/视频)""" - try: - if type == 'img': - describe = u'图片' - key = 'original_pictures' - else: - describe = u'视频' - key = 'video_url' - print(u'即将进行%s下载' % describe) - file_dir = self.get_filepath(type) - for w in tqdm(self.weibo, desc=u'%s下载进度' % describe): - if w[key] != u'无': - file_prefix = w['publish_time'][:11].replace( - '-', '') + '_' + w['id'] - if type == 'img' and ',' in w[key]: - w[key] = w[key].split(',') - for j, url in enumerate(w[key]): - file_suffix = url[url.rfind('.'):] - file_name = file_prefix + '_' + str( - j + 1) + file_suffix - file_path = file_dir + os.sep + file_name - self.download_one_file(url, file_path, type, - w['id']) - else: - if type == 'video': - file_suffix = '.mp4' - else: - file_suffix = w[key][w[key].rfind('.'):] - file_name = file_prefix + file_suffix - file_path = file_dir + os.sep + 
file_name - self.download_one_file(w[key], file_path, type, - w['id']) - print(u'%s下载完毕,保存路径:' % describe) - print(file_dir) - except Exception as e: - print('Error: ', e) - traceback.print_exc() - - def get_one_weibo(self, info): - """获取一条微博的全部信息""" - try: - weibo = OrderedDict() - is_original = self.is_original(info) - if (not self.filter) or is_original: - weibo['id'] = info.xpath('@id')[0][2:] - weibo['content'] = self.get_weibo_content(info, - is_original) # 微博内容 - picture_urls = self.get_picture_urls(info, is_original) - weibo['original_pictures'] = picture_urls[ - 'original_pictures'] # 原创图片url - if not self.filter: - weibo['retweet_pictures'] = picture_urls[ - 'retweet_pictures'] # 转发图片url - weibo['original'] = is_original # 是否原创微博 - weibo['video_url'] = self.get_video_url(info, - is_original) # 微博视频url - weibo['publish_place'] = self.get_publish_place(info) # 微博发布位置 - weibo['publish_time'] = self.get_publish_time(info) # 微博发布时间 - weibo['publish_tool'] = self.get_publish_tool(info) # 微博发布工具 - footer = self.get_weibo_footer(info) - weibo['up_num'] = footer['up_num'] # 微博点赞数 - weibo['retweet_num'] = footer['retweet_num'] # 转发数 - weibo['comment_num'] = footer['comment_num'] # 评论数 - else: - weibo = None - return weibo - except Exception as e: - print('Error: ', e) - traceback.print_exc() - - def print_one_weibo(self, weibo): - """打印一条微博""" - print(weibo['content']) - print(u'微博发布位置:%s' % weibo['publish_place']) - print(u'发布发布时间:%s' % weibo['publish_time']) - print(u'发布发布工具:%s' % weibo['publish_tool']) - print(u'点赞数:%d' % weibo['up_num']) - print(u'转发数:%d' % weibo['retweet_num']) - print(u'评论数:%d' % weibo['comment_num']) - - def is_pinned_weibo(self, info): - """判断微博是否为置顶微博""" - kt = info.xpath(".//span[@class='kt']/text()") - if kt and kt[0] == u'置顶': - return True - else: - return False - - def get_one_page(self, page): - """获取第page页的全部微博""" - try: - url = 'https://weibo.cn/u/%s?page=%d' % (self.user_id, page) - selector = self.deal_html(url) - info = 
selector.xpath("//div[@class='c']") - is_exist = info[0].xpath("div/span[@class='ctt']") - if is_exist: - for i in range(0, len(info) - 2): - weibo = self.get_one_weibo(info[i]) - if weibo: - if weibo['id'] in self.weibo_id_list: - continue - publish_time = datetime.strptime( - weibo['publish_time'][:10], "%Y-%m-%d") - since_date = datetime.strptime(self.since_date, - "%Y-%m-%d") - if publish_time < since_date: - if self.is_pinned_weibo(info[i]): - continue - else: - return True - self.print_one_weibo(weibo) - self.weibo.append(weibo) - self.weibo_id_list.append(weibo['id']) - self.got_num += 1 - print('-' * 100) - except Exception as e: - print('Error: ', e) - traceback.print_exc() - - def get_filepath(self, type): - """获取结果文件路径""" - try: - file_dir = os.path.split( - os.path.realpath(__file__) - )[0] + os.sep + 'weibo' + os.sep + self.user['nickname'] - if type == 'img' or type == 'video': - file_dir = file_dir + os.sep + type - if not os.path.isdir(file_dir): - os.makedirs(file_dir) - if type == 'img' or type == 'video': - return file_dir - file_path = file_dir + os.sep + self.user_id + '.' 
+ type - return file_path - except Exception as e: - print('Error: ', e) - traceback.print_exc() - - def write_csv(self, wrote_num): - """将爬取的信息写入csv文件""" - try: - result_headers = [ - '微博id', - '微博正文', - '原始图片url', - '微博视频url', - '发布位置', - '发布时间', - '发布工具', - '点赞数', - '转发数', - '评论数', - ] - if not self.filter: - result_headers.insert(3, '被转发微博原始图片url') - result_headers.insert(4, '是否为原创微博') - result_data = [w.values() for w in self.weibo][wrote_num:] - if sys.version < '3': # python2.x - reload(sys) - sys.setdefaultencoding('utf-8') - with open(self.get_filepath('csv'), 'ab') as f: - f.write(codecs.BOM_UTF8) - writer = csv.writer(f) - if wrote_num == 0: - writer.writerows([result_headers]) - writer.writerows(result_data) - else: # python3.x - with open(self.get_filepath('csv'), - 'a', - encoding='utf-8-sig', - newline='') as f: - writer = csv.writer(f) - if wrote_num == 0: - writer.writerows([result_headers]) - writer.writerows(result_data) - print(u'%d条微博写入csv文件完毕,保存路径:' % self.got_num) - print(self.get_filepath('csv')) - except Exception as e: - print('Error: ', e) - traceback.print_exc() - - def write_txt(self, wrote_num): - """将爬取的信息写入txt文件""" - try: - temp_result = [] - if wrote_num == 0: - if self.filter: - result_header = u'\n\n原创微博内容: \n' - else: - result_header = u'\n\n微博内容: \n' - result_header = (u'用户信息\n用户昵称:' + self.user['nickname'] + - u'\n用户id: ' + str(self.user_id) + u'\n微博数: ' + - str(self.user['weibo_num']) + u'\n关注数: ' + - str(self.user['following']) + u'\n粉丝数: ' + - str(self.user['followers']) + result_header) - temp_result.append(result_header) - for i, w in enumerate(self.weibo[wrote_num:]): - temp_result.append( - str(wrote_num + i + 1) + ':' + w['content'] + '\n' + - u'微博位置: ' + w['publish_place'] + '\n' + u'发布时间: ' + - w['publish_time'] + '\n' + u'点赞数: ' + str(w['up_num']) + - u' 转发数: ' + str(w['retweet_num']) + u' 评论数: ' + - str(w['comment_num']) + '\n' + u'发布工具: ' + - w['publish_tool'] + '\n\n') - result = ''.join(temp_result) - with 
open(self.get_filepath('txt'), 'ab') as f: - f.write(result.encode(sys.stdout.encoding)) - print(u'%d条微博写入txt文件完毕,保存路径:' % self.got_num) - print(self.get_filepath('txt')) - except Exception as e: - print('Error: ', e) - traceback.print_exc() - - def info_to_mongodb(self, collection, info_list): - """将爬取的信息写入MongoDB数据库""" - from pymongo import MongoClient - - client = MongoClient() - db = client['weibo'] - collection = db[collection] - for info in info_list: - if not collection.find_one({'id': info['id']}): - collection.insert_one(info) - else: - collection.update_one({'id': info['id']}, {'$set': info}) - - def weibo_to_mongodb(self, wrote_num): - """将爬取的微博信息写入MongoDB数据库""" - weibo_list = [] - for w in self.weibo[wrote_num:]: - w['user_id'] = self.user_id - weibo_list.append(w) - self.info_to_mongodb('weibo', weibo_list) - print(u'%d条微博写入MongoDB数据库完毕' % self.got_num) - - def change_mysql_config(self, mysql_config): - """修改MySQL数据库连接配置""" - self.mysql_config = mysql_config - - def mysql_create(self, connection, sql): - """创建MySQL数据库或表""" - try: - with connection.cursor() as cursor: - cursor.execute(sql) - finally: - connection.close() - - def mysql_create_database(self, mysql_config, sql): - """创建MySQL数据库""" - import pymysql - - if self.mysql_config: - mysql_config = self.mysql_config - connection = pymysql.connect(**mysql_config) - self.mysql_create(connection, sql) - - def mysql_create_table(self, mysql_config, sql): - """创建MySQL表""" - import pymysql - - if self.mysql_config: - mysql_config = self.mysql_config - mysql_config['db'] = 'weibo' - connection = pymysql.connect(**mysql_config) - self.mysql_create(connection, sql) - - def mysql_insert(self, mysql_config, table, data_list): - """向MySQL表插入或更新数据""" - import pymysql - - if len(data_list) > 0: - keys = ', '.join(data_list[0].keys()) - values = ', '.join(['%s'] * len(data_list[0])) - if self.mysql_config: - mysql_config = self.mysql_config - mysql_config['db'] = 'weibo' - connection = 
pymysql.connect(**mysql_config) - cursor = connection.cursor() - sql = """INSERT INTO {table}({keys}) VALUES ({values}) ON - DUPLICATE KEY UPDATE""".format(table=table, - keys=keys, - values=values) - update = ','.join([ - " {key} = values({key})".format(key=key) - for key in data_list[0] - ]) - sql += update - try: - cursor.executemany( - sql, [tuple(data.values()) for data in data_list]) - connection.commit() - except Exception as e: - connection.rollback() - print('Error: ', e) - traceback.print_exc() - finally: - connection.close() - - def weibo_to_mysql(self, wrote_num): - """将爬取的微博信息写入MySQL数据库""" - mysql_config = { - 'host': 'localhost', - 'port': 3306, - 'user': 'root', - 'password': '123456', - 'charset': 'utf8mb4' - } - # 创建'weibo'表 - create_table = """ - CREATE TABLE IF NOT EXISTS weibo ( - id varchar(10) NOT NULL, - user_id varchar(12), - content varchar(2000), - original_pictures varchar(1000), - retweet_pictures varchar(1000), - original BOOLEAN NOT NULL DEFAULT 1, - video_url varchar(300), - publish_place varchar(100), - publish_time DATETIME NOT NULL, - publish_tool varchar(30), - up_num INT NOT NULL, - retweet_num INT NOT NULL, - comment_num INT NOT NULL, - PRIMARY KEY (id) - ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4""" - self.mysql_create_table(mysql_config, create_table) - # 在'weibo'表中插入或更新微博数据 - weibo_list = [] - for weibo in self.weibo[wrote_num:]: - weibo['user_id'] = self.user_id - weibo_list.append(weibo) - self.mysql_insert(mysql_config, 'weibo', weibo_list) - print(u'%d条微博写入MySQL数据库完毕' % self.got_num) - - def write_data(self, wrote_num): - """将爬取到的信息写入文件或数据库""" - if self.got_num > wrote_num: - self.write_csv(wrote_num) - self.write_txt(wrote_num) - if self.mysql_write: - self.weibo_to_mysql(wrote_num) - if self.mongodb_write: - self.weibo_to_mongodb(wrote_num) - - def get_weibo_info(self): - """获取微博信息""" - try: - url = 'https://weibo.cn/u/%s' % (self.user_id) - selector = self.deal_html(url) - self.get_user_info(selector) # 获取用户昵称、微博数、关注数、粉丝数 
- page_num = self.get_page_num(selector) # 获取微博总页数 - wrote_num = 0 - page1 = 0 - random_pages = random.randint(1, 5) - for page in tqdm(range(1, page_num + 1), desc=u'进度'): - is_end = self.get_one_page(page) # 获取第page页的全部微博 - if is_end: - break - - if page % 20 == 0: # 每爬20页写入一次文件 - self.write_data(wrote_num) - wrote_num = self.got_num - - # 通过加入随机等待避免被限制。爬虫速度过快容易被系统限制(一段时间后限 - # 制会自动解除),加入随机等待模拟人的操作,可降低被系统限制的风险。默 - # 认是每爬取1到5页随机等待6到10秒,如果仍然被限,可适当增加sleep时间 - if page - page1 == random_pages and page < page_num: - sleep(random.randint(6, 10)) - page1 = page - random_pages = random.randint(1, 5) - - self.write_data(wrote_num) # 将剩余不足20页的微博写入文件 - if not self.filter: - print(u'共爬取' + str(self.got_num) + u'条微博') - else: - print(u'共爬取' + str(self.got_num) + u'条原创微博') - except Exception as e: - print('Error: ', e) - traceback.print_exc() - - def get_user_list(self, file_name): - """获取文件中的微博id信息""" - with open(file_name, 'rb') as f: - lines = f.read().splitlines() - lines = [line.decode('utf-8') for line in lines] - user_id_list = [ - line.split(' ')[0] for line in lines - if len(line.split(' ')) > 0 and line.split(' ')[0].isdigit() - ] - return user_id_list - - def initialize_info(self, user_id): - """初始化爬虫信息""" - self.got_num = 0 - self.weibo = [] - self.user = {} - self.user_id = user_id - self.weibo_id_list = [] - - def start(self, user_id_list): - """运行爬虫""" - try: - for user_id in user_id_list: - self.initialize_info(user_id) - print('*' * 100) - self.get_weibo_info() - print(u'信息抓取完毕') - print('*' * 100) - if self.pic_download == 1: - self.download_files('img') - if self.video_download == 1: - self.download_files('video') - except Exception as e: - print('Error: ', e) - traceback.print_exc() - - -def main(): - try: - # 以下是程序配置信息,可以根据自己需求修改 - filter = 1 # 值为0表示爬取全部微博(原创微博+转发微博),值为1表示只爬取原创微博 - since_date = '2018-01-01' # 起始时间,即爬取发布日期从该值到现在的微博,形式为yyyy-mm-dd - """值为0代表不将结果写入MongoDB数据库,1代表写入;若要写入MongoDB数据库, - 请先安装MongoDB数据库和pymongo,pymongo安装方法为命令行运行:pip install pymongo""" 
- mongodb_write = 0 - """值为0代表不将结果写入MySQL数据库,1代表写入;若要写入MySQL数据库, - 请先安装MySQL数据库和pymysql,pymysql安装方法为命令行运行:pip install pymysql""" - mysql_write = 0 - pic_download = 1 # 值为0代表不下载微博原始图片,1代表下载微博原始图片 - video_download = 1 # 值为0代表不下载微博视频,1代表下载微博视频 - - wb = Weibo(filter, since_date, mongodb_write, mysql_write, - pic_download, video_download) - - # 下面是自定义MySQL数据库连接配置(可选) - """因为操作MySQL数据库需要用户名、密码等参数,本程序默认为: - mysql_config = { - 'host': 'localhost', - 'port': 3306, - 'user': 'root', - 'password': '123456', - 'charset': 'utf8mb4' - } - 大家的参数配置如果和默认值不同,可以将上面的参数值替换成自己的, - 然后添加如下代码,使修改生效,如果你的参数和默认值相同则不需要下面的代码: - wb.change_mysql_config(mysql_config)""" - - # 下面是配置user_id_list - """user_id_list包含了要爬的目标微博id,可以是一个,也可以是多个,也可以从文件中读取 - 爬单个微博,user_id_list如下所示,可以改成任意合法的用户id - user_id_list = ['1669879400'] - 爬多个微博,user_id_list如下所示,可以改成任意合法的用户id - user_id_list = ['1669879400', '1729370543'] - 也可以在文件中读取user_id_list,文件中可以包含很多user_id, - 每个user_id占一行,也可以在user_id后面加注释,如用户昵称,user_id和注释之间必需要有空格, - 文件名任意,类型为txt,位置位于本程序的同目录下,文件内容可以为如下形式: - 1223178222 胡歌 - 1669879400 迪丽热巴 - 1729370543 郭碧婷 - 比如文件可以叫user_id_list.txt,读取文件中的user_id_list如下所示: - user_id_list = wb.get_user_list('user_id_list.txt')""" - user_id_list = ['1669879400'] - - wb.start(user_id_list) # 爬取微博信息 - except Exception as e: - print('Error: ', e) - traceback.print_exc() - - -if __name__ == '__main__': - main() diff --git a/writer.py b/writer.py new file mode 100644 index 00000000..6c1b7a53 --- /dev/null +++ b/writer.py @@ -0,0 +1,261 @@ +import csv +import os +import sys +import traceback + + +def get_filepath(type, nickname): + """获取结果文件路径""" + file_dir = os.path.split( + os.path.realpath(__file__))[0] + os.sep + 'weibo' + os.sep + nickname + if type == 'img' or type == 'video': + file_dir = file_dir + os.sep + type + if not os.path.isdir(file_dir): + os.makedirs(file_dir) + if type == 'img' or type == 'video': + return file_dir + file_path = file_dir + os.sep + nickname + '.' 
+ type + return file_path + + +class Writer: + def __init__(self, config): + write_mode = config['write_mode'] + if write_mode == 'txt': + self.writer = TxtWriter(config) + elif write_mode == 'csv': + self.writer = CsvWriter(config) + elif write_mode == 'mysql': + self.writer = MysqlWriter(config) + elif write_mode == 'mongo': + self.writer = MongoWriter(config) + + def write_user(self, user): + self.writer.write_user(user) + + def write_weibo(self, weibo): + self.writer.write_weibo(weibo) + + +class TxtWriter: + def __init__(self, config): + self.config = config + + def write_user(self, user): + self.user = user + if self.config['filter']: + result_header = u'\n\n原创微博内容: \n' + else: + result_header = u'\n\n微博内容: \n' + result_header = (u'用户信息\n用户昵称:' + user['nickname'] + u'\n用户id: ' + + str(user['user_id']) + u'\n微博数: ' + + str(user['weibo_num']) + u'\n关注数: ' + + str(user['following']) + u'\n粉丝数: ' + + str(user['followers']) + result_header) + + with open(get_filepath('txt', user['nickname']), 'ab') as f: + f.write(result_header.encode(sys.stdout.encoding)) + + def write_weibo(self, weibo): + """将爬取的信息写入txt文件""" + + temp_result = [] + for w in weibo: + temp_result.append(w['content'] + '\n' + u'微博位置: ' + + w['publish_place'] + '\n' + u'发布时间: ' + + w['publish_time'] + '\n' + u'点赞数: ' + + str(w['up_num']) + u' 转发数: ' + + str(w['retweet_num']) + u' 评论数: ' + + str(w['comment_num']) + '\n' + u'发布工具: ' + + w['publish_tool'] + '\n\n') + result = ''.join(temp_result) + with open(get_filepath('txt', self.user['nickname']), 'ab') as f: + f.write(result.encode(sys.stdout.encoding)) + print(u'%d条微博写入txt文件完毕,保存路径:' % len(weibo)) + print(get_filepath('txt', self.user['nickname'])) + + +class CsvWriter: + def __init__(self, config): + self.config = config + + def write_user(self, user): + self.user = user + result_headers = [ + '微博id', + '微博正文', + '原始图片url', + '微博视频url', + '发布位置', + '发布时间', + '发布工具', + '点赞数', + '转发数', + '评论数', + ] + if not self.config['filter']: + 
result_headers.insert(3, '被转发微博原始图片url') + result_headers.insert(4, '是否为原创微博') + with open(get_filepath('csv', self.user['nickname']), + 'a', + encoding='utf-8-sig', + newline='') as f: + csv_writer = csv.writer(f) + csv_writer.writerows([result_headers]) + + def write_weibo(self, weibo): + """将爬取的信息写入csv文件""" + result_data = [w.values() for w in weibo] + with open(get_filepath('csv', self.user['nickname']), + 'a', + encoding='utf-8-sig', + newline='') as f: + csv_writer = csv.writer(f) + csv_writer.writerows(result_data) + + print(u'%d条微博写入csv文件完毕,保存路径:' % len(weibo)) + print(get_filepath('csv', self.user['nickname'])) + + +class MongoWriter: + def __init__(self, config): + self.config = config + + def write_user(self, user): + """将爬取的用户信息写入MongoDB数据库""" + self.user = user + + user_list = [user] + self.info_to_mongodb('user', user_list) + print(u'%s信息写入MongoDB数据库完毕' % user['nickname']) + + def info_to_mongodb(self, collection, info_list): + """将爬取的信息写入MongoDB数据库""" + from pymongo import MongoClient + + client = MongoClient() + db = client['weibo'] + collection = db[collection] + for info in info_list: + if not collection.find_one({'id': info['id']}): + collection.insert_one(info) + else: + collection.update_one({'id': info['id']}, {'$set': info}) + + def weibo_to_mongodb(self, weibo): + """将爬取的微博信息写入MongoDB数据库""" + weibo_list = [] + for w in weibo: + w['user_id'] = self.user['user_id'] + weibo_list.append(w) + self.info_to_mongodb('weibo', weibo_list) + print(u'%d条微博写入MongoDB数据库完毕' % len(weibo)) + + +class MysqlWriter: + def __init__(self, config): + self.config = config + + def write_user(self, user): + """将爬取的用户信息写入MySQL数据库""" + self.user = user + # 创建'weibo'数据库 + create_database = """CREATE DATABASE IF NOT EXISTS weibo DEFAULT + CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci""" + self.mysql_create_database(create_database) + # 创建'user'表 + create_table = """ + CREATE TABLE IF NOT EXISTS user ( + id varchar(12) NOT NULL, + nickname varchar(30), + weibo_num INT, 
+ following INT, + followers INT, + PRIMARY KEY (id) + ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4""" + self.mysql_create_table(create_table) + self.mysql_insert('user', [user]) + print(u'%s信息写入MySQL数据库完毕' % user['nickname']) + + def write_weibo(self, weibo): + """将爬取的微博信息写入MySQL数据库""" + # 创建'weibo'表 + create_table = """ + CREATE TABLE IF NOT EXISTS weibo ( + id varchar(10) NOT NULL, + user_id varchar(12), + content varchar(2000), + original_pictures varchar(1000), + retweet_pictures varchar(1000), + original BOOLEAN NOT NULL DEFAULT 1, + video_url varchar(300), + publish_place varchar(100), + publish_time DATETIME NOT NULL, + publish_tool varchar(30), + up_num INT NOT NULL, + retweet_num INT NOT NULL, + comment_num INT NOT NULL, + PRIMARY KEY (id) + ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4""" + self.mysql_create_table(create_table) + # 在'weibo'表中插入或更新微博数据 + weibo_list = [] + for w in weibo: + w['user_id'] = self.user['user_id'] + weibo_list.append(w) + self.mysql_insert('weibo', weibo_list) + print(u'%d条微博写入MySQL数据库完毕' % len(weibo)) + + def mysql_create(self, connection, sql): + """创建MySQL数据库或表""" + try: + with connection.cursor() as cursor: + cursor.execute(sql) + finally: + connection.close() + + def mysql_create_database(self, sql): + """创建MySQL数据库""" + import pymysql + mysql_config = self.config['mysql_config'] + connection = pymysql.connect(**mysql_config) + self.mysql_create(connection, sql) + + def mysql_create_table(self, sql): + """创建MySQL表""" + import pymysql + mysql_config = self.config['mysql_config'] + mysql_config['db'] = 'weibo' + connection = pymysql.connect(**mysql_config) + self.mysql_create(connection, sql) + + def mysql_insert(self, table, data_list): + """向MySQL表插入或更新数据""" + import pymysql + mysql_config = self.config['mysql_config'] + + if len(data_list) > 0: + keys = ', '.join(data_list[0].keys()) + values = ', '.join(['%s'] * len(data_list[0])) + mysql_config['db'] = 'weibo' + connection = pymysql.connect(**mysql_config) + cursor = 
connection.cursor() + sql = """INSERT INTO {table}({keys}) VALUES ({values}) ON + DUPLICATE KEY UPDATE""".format(table=table, + keys=keys, + values=values) + update = ','.join([ + " {key} = values({key})".format(key=key) + for key in data_list[0] + ]) + sql += update + try: + cursor.executemany( + sql, [tuple(data.values()) for data in data_list]) + connection.commit() + except Exception as e: + connection.rollback() + print('Error: ', e) + traceback.print_exc() + finally: + connection.close() \ No newline at end of file From a23d719a37031e461d40a86d919916fcd63aeb4c Mon Sep 17 00:00:00 2001 From: songzy Date: Wed, 13 Nov 2019 23:07:53 +0800 Subject: [PATCH 084/363] 1. fix csv headers order 2. support write mode list --- config_sample.json | 2 +- validator.py | 5 +++-- writer.py | 34 +++++++++++++++++++--------------- 3 files changed, 23 insertions(+), 18 deletions(-) diff --git a/config_sample.json b/config_sample.json index 5af9b697..cc50474a 100644 --- a/config_sample.json +++ b/config_sample.json @@ -1,7 +1,7 @@ { "filter": 1, "since_date": "2018-01-01", - "write_mode": "txt", + "write_mode": ["csv", "txt"], "pic_download": 1, "video_download": 1, "cookie": "your cookie", diff --git a/validator.py b/validator.py index aecb7873..5d813e6e 100644 --- a/validator.py +++ b/validator.py @@ -26,8 +26,9 @@ def validate(self): for key in date_config: if not self.is_date(self.config[key]): sys.exit(f"{key}值应为yyyy-mm-dd形式,请重新输入") - if self.config['write_mode'] not in ['txt', 'csv', 'mysql', 'mongo']: - sys.exit("write_mode值应为txt,csv,mysql,mongo,请重新输入") + for mode in self.config['write_mode']: + if mode not in ['txt', 'csv', 'mysql', 'mongo']: + sys.exit("write_mode值应为txt,csv,mysql,mongo,请重新输入") def is_date(self, since_date): """判断日期格式是否正确""" diff --git a/writer.py b/writer.py index 6c1b7a53..e08b85b8 100644 --- a/writer.py +++ b/writer.py @@ -21,20 +21,24 @@ def get_filepath(type, nickname): class Writer: def __init__(self, config): write_mode = config['write_mode'] - if 
write_mode == 'txt': - self.writer = TxtWriter(config) - elif write_mode == 'csv': - self.writer = CsvWriter(config) - elif write_mode == 'mysql': - self.writer = MysqlWriter(config) - elif write_mode == 'mongo': - self.writer = MongoWriter(config) + self.writers = [] + + if 'txt' in write_mode: + self.writers.append(TxtWriter(config)) + if 'csv' in write_mode: + self.writers.append(CsvWriter(config)) + if 'mysql' in write_mode: + self.writers.append(MysqlWriter(config)) + if 'mongo' in write_mode: + self.writers.append(MongoWriter(config)) def write_user(self, user): - self.writer.write_user(user) + for writer in self.writers: + writer.write_user(user) def write_weibo(self, weibo): - self.writer.write_weibo(weibo) + for writer in self.writers: + writer.write_weibo(weibo) class TxtWriter: @@ -84,18 +88,18 @@ def write_user(self, user): result_headers = [ '微博id', '微博正文', - '原始图片url', - '微博视频url', '发布位置', '发布时间', '发布工具', '点赞数', '转发数', '评论数', + '原始图片url', + '微博视频url', ] if not self.config['filter']: - result_headers.insert(3, '被转发微博原始图片url') - result_headers.insert(4, '是否为原创微博') + result_headers.insert(-1, '被转发微博原始图片url') + result_headers.insert(-1, '是否为原创微博') with open(get_filepath('csv', self.user['nickname']), 'a', encoding='utf-8-sig', @@ -258,4 +262,4 @@ def mysql_insert(self, table, data_list): print('Error: ', e) traceback.print_exc() finally: - connection.close() \ No newline at end of file + connection.close() From 6a8b1fd50cb71c16caf7342281059dd602e68e89 Mon Sep 17 00:00:00 2001 From: songzy Date: Sat, 16 Nov 2019 14:22:51 +0800 Subject: [PATCH 085/363] four updates based on comments from last pull request: 1. remove unused imports 2. support user_id_list file 3. remove f expression, support python3.5 4. 
support since_date to be days from today --- config_sample.json | 6 ++---- downloader.py | 4 ++-- html_parser.py | 9 ++++----- spider.py | 22 +++++++++++++--------- user_id_list.txt | 1 + validator.py | 31 +++++++++++++++++-------------- 6 files changed, 39 insertions(+), 34 deletions(-) create mode 100644 user_id_list.txt diff --git a/config_sample.json b/config_sample.json index cc50474a..0f9f20b1 100644 --- a/config_sample.json +++ b/config_sample.json @@ -1,6 +1,6 @@ { "filter": 1, - "since_date": "2018-01-01", + "since_date": 10, "write_mode": ["csv", "txt"], "pic_download": 1, "video_download": 1, @@ -12,7 +12,5 @@ "password": "123456", "charset": "utf8mb4" }, - "user_id_list": [ - "1669879400" - ] + "user_id_list": "user_id_list.txt" } \ No newline at end of file diff --git a/downloader.py b/downloader.py index 926226bf..d1621cbc 100644 --- a/downloader.py +++ b/downloader.py @@ -1,10 +1,11 @@ import os import sys -from tqdm import tqdm import traceback import requests from requests.adapters import HTTPAdapter +from tqdm import tqdm + class Downloader: def __init__(self, config): @@ -13,7 +14,6 @@ def __init__(self, config): def download_files(self, file_path, type, weibo): """下载文件(图片/视频)""" try: - if type == 'img': describe = u'图片' key = 'original_pictures' diff --git a/html_parser.py b/html_parser.py index f82b9986..ae27351c 100644 --- a/html_parser.py +++ b/html_parser.py @@ -1,4 +1,3 @@ -import os import requests import sys from lxml import etree @@ -53,7 +52,7 @@ def extract_picture_urls(self, info, weibo_id): else: picture_urls = u'无' return picture_urls - except Exception as e: + except Exception: return u'无' def get_picture_urls(self, info, is_original): @@ -109,7 +108,7 @@ def get_video_url(self, info, is_original): else: video_url = u'无' return video_url - except Exception as e: + except Exception: return u'无' def get_page_num(self, selector): @@ -205,7 +204,7 @@ def get_publish_place(self, info): if len(weibo_a) >= 1: publish_place = weibo_a[-1] 
if (u'视频' == div_first.xpath("span[@class='ctt']/a/text()") - [-1][-2:]): + [-1][-2:]): if len(weibo_a) >= 2: publish_place = weibo_a[-2] else: @@ -322,4 +321,4 @@ def is_pinned_weibo(self, info): if kt and kt[0] == u'置顶': return True else: - return False \ No newline at end of file + return False diff --git a/spider.py b/spider.py index 8faed152..fb6cfd83 100644 --- a/spider.py +++ b/spider.py @@ -1,19 +1,11 @@ #!/usr/bin/env python # -*- coding: UTF-8 -*- -import codecs -import csv -import os import random -import re import sys -import traceback -from collections import OrderedDict -from datetime import datetime, timedelta +from datetime import datetime, date, timedelta from time import sleep -from lxml import etree -import requests from tqdm import tqdm from validator import Validator @@ -33,6 +25,18 @@ def __init__(self, config): t.strip().split("=")[0]: t.strip().split("=")[1] for t in self.config['cookie'].split(";") } + if type(self.config['user_id_list']) == type(""): + with open(self.config['user_id_list'], 'rb') as f: + lines = f.read().splitlines() + lines = [line.decode('utf-8') for line in lines] + self.config['user_id_list'] = [ + line.split(' ')[0] for line in lines if + len(line.split(' ')) > 0 and line.split(' ')[0].isdigit() + ] + if type(self.config['since_date']) == type(0): + self.config['since_date'] = str( + date.today() - timedelta(self.config['since_date'])) + self.validator = Validator(self.config) self.validator.validate() self.printer = Printer() diff --git a/user_id_list.txt b/user_id_list.txt new file mode 100644 index 00000000..3acc69f8 --- /dev/null +++ b/user_id_list.txt @@ -0,0 +1 @@ +7053204102 majiko \ No newline at end of file diff --git a/validator.py b/validator.py index 5d813e6e..86d12f1a 100644 --- a/validator.py +++ b/validator.py @@ -1,12 +1,22 @@ from datetime import datetime import sys + +def is_date(since_date): + """判断日期格式是否正确""" + try: + datetime.strptime(since_date, "%Y-%m-%d") + return True + except: + return False 
+ + class Validator: def __init__(self, config): """ - self.user_id = '' # 用户id,如昵称为"Dear-迪丽热巴"的id为'1669879400' + self.user_id_list = '' # 1. 用户id list,如昵称为"Dear-迪丽热巴"的id为'1669879400';2. 存储用户id list 的文件名 + self.since_date = since_date # 1. 起始时间,即爬取发布日期从该值到现在的微博,形式为yyyy-mm-dd 2. 起始时间距离今天的天数,形式为一个整数 self.filter = filter # 取值范围为0、1,程序默认值为0,代表要爬取用户的全部微博,1代表只爬取用户的原创微博 - self.since_date = since_date # 起始时间,即爬取发布日期从该值到现在的微博,形式为yyyy-mm-dd self.mongodb_write = mongodb_write # 值为0代表不将结果写入MongoDB数据库,1代表写入 self.mysql_write = mysql_write # 值为0代表不将结果写入MySQL数据库,1代表写入 self.pic_download = pic_download # 取值范围为0、1,程序默认值为0,代表不下载微博原始图片,1代表下载 @@ -22,18 +32,11 @@ def validate(self): for key in bool_config: if self.config[key] not in [0, 1]: - sys.exit(f"{key}值应为0或1,请重新输入") + sys.exit("%s值应为0或1,请重新输入" % key) for key in date_config: - if not self.is_date(self.config[key]): - sys.exit(f"{key}值应为yyyy-mm-dd形式,请重新输入") + if not (type(self.config[key]) == type(0) + or is_date(self.config[key])): + sys.exit("%s值应为yyyy-mm-dd形式或整数,请重新输入" % key) for mode in self.config['write_mode']: if mode not in ['txt', 'csv', 'mysql', 'mongo']: - sys.exit("write_mode值应为txt,csv,mysql,mongo,请重新输入") - - def is_date(self, since_date): - """判断日期格式是否正确""" - try: - datetime.strptime(since_date, "%Y-%m-%d") - return True - except ValueError: - return False + sys.exit("write_mode值应为txt,csv,mysql,mongo,请重新输入") \ No newline at end of file From 65243795721e5e47df8dac239a153f52a34f128f Mon Sep 17 00:00:00 2001 From: chenlei Date: Sat, 16 Nov 2019 22:31:19 +0800 Subject: [PATCH 086/363] =?UTF-8?q?feat:=20=E6=B7=BB=E5=8A=A0=E5=88=A4?= =?UTF-8?q?=E6=96=ADMongoDB=E5=92=8Cpymongo=E6=98=AF=E5=90=A6=E5=AE=89?= =?UTF-8?q?=E8=A3=85=E6=88=96=E5=90=AF=E5=8A=A8=E7=9A=84=E5=8A=9F=E8=83=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 若使用者配置程序为将结果写入MongoDB数据库,且MongoDB或pymongo没有安装/启动,则给出相应提示 --- weiboSpider.py | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) 
diff --git a/weiboSpider.py b/weiboSpider.py index 529781ed..48b17449 100644 --- a/weiboSpider.py +++ b/weiboSpider.py @@ -685,16 +685,22 @@ def write_txt(self, wrote_num): def info_to_mongodb(self, collection, info_list): """将爬取的信息写入MongoDB数据库""" - from pymongo import MongoClient - - client = MongoClient() - db = client['weibo'] - collection = db[collection] - for info in info_list: - if not collection.find_one({'id': info['id']}): - collection.insert_one(info) - else: - collection.update_one({'id': info['id']}, {'$set': info}) + try: + import pymongo + except ImportError: + sys.exit(u'系统中可能没有安装pymongo库,请先运行 pip install pymongo ,再运行程序') + try: + from pymongo import MongoClient + client = MongoClient() + db = client['weibo'] + collection = db[collection] + for info in info_list: + if not collection.find_one({'id': info['id']}): + collection.insert_one(info) + else: + collection.update_one({'id': info['id']}, {'$set': info}) + except pymongo.errors.ServerSelectionTimeoutError: + sys.exit(u'系统中可能没有安装或启动MongoDB数据库,请先根据系统环境安装或启动MongoDB,再运行程序') def weibo_to_mongodb(self, wrote_num): """将爬取的微博信息写入MongoDB数据库""" From 14b1b0188c72070f30bb8d8385e1affec4d10c84 Mon Sep 17 00:00:00 2001 From: chenlei Date: Sun, 17 Nov 2019 19:28:05 +0800 Subject: [PATCH 087/363] =?UTF-8?q?feat:=20=E6=B7=BB=E5=8A=A0=E5=9C=A8?= =?UTF-8?q?=E6=89=A7=E8=A1=8C=E5=86=99=E5=85=A5mysql=E6=97=B6=E5=88=A4?= =?UTF-8?q?=E6=96=ADmysql=E7=8E=AF=E5=A2=83=E6=98=AF=E5=90=A6=E6=AD=A3?= =?UTF-8?q?=E7=A1=AE=E8=BF=90=E8=A1=8C=E7=9A=84=E5=8A=9F=E8=83=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 若使用者配置将结果写入mysql且运行出错,将出错的可能原因告终使用者 --- weiboSpider.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/weiboSpider.py b/weiboSpider.py index 48b17449..ab5ccd4f 100644 --- a/weiboSpider.py +++ b/weiboSpider.py @@ -725,12 +725,17 @@ def mysql_create(self, connection, sql): def mysql_create_database(self, mysql_config, sql): """创建MySQL数据库""" - 
import pymysql - - if self.mysql_config: - mysql_config = self.mysql_config - connection = pymysql.connect(**mysql_config) - self.mysql_create(connection, sql) + try: + import pymysql + except ImportError: + sys.exit(u'系统中可能没有安装pymysql库,请先运行 pip install pymysql ,再运行程序') + try: + if self.mysql_config: + mysql_config = self.mysql_config + connection = pymysql.connect(**mysql_config) + self.mysql_create(connection, sql) + except pymysql.OperationalError: + sys.exit(u'系统中可能没有安装或正确配置MySQL数据库,请先根据系统环境安装或配置MySQL,再运行程序') def mysql_create_table(self, mysql_config, sql): """创建MySQL表""" From 52f32fefb49994ef93640addd8737dbea7c0666b Mon Sep 17 00:00:00 2001 From: songzy Date: Sun, 17 Nov 2019 19:30:39 +0800 Subject: [PATCH 088/363] several updates: 1. fix mongo 2. fix mysql 3. support python2 --- downloader.py | 3 +- html_parser.py | 4 ++- printer.py | 5 ++- spider.py | 12 +++---- validator.py | 4 ++- writer.py | 95 +++++++++++++++++++++++++++++++------------------- 6 files changed, 78 insertions(+), 45 deletions(-) diff --git a/downloader.py b/downloader.py index d1621cbc..87fc1595 100644 --- a/downloader.py +++ b/downloader.py @@ -1,9 +1,10 @@ +# -*- coding: UTF-8 -*- import os import sys import traceback + import requests from requests.adapters import HTTPAdapter - from tqdm import tqdm diff --git a/html_parser.py b/html_parser.py index ae27351c..3412e4aa 100644 --- a/html_parser.py +++ b/html_parser.py @@ -1,4 +1,4 @@ -import requests +# -*- coding: UTF-8 -*- import sys from lxml import etree import traceback @@ -6,6 +6,8 @@ from datetime import datetime, timedelta from collections import OrderedDict +import requests + class Parser: def __init__(self, config): diff --git a/printer.py b/printer.py index 8c126bf6..5f87bdbd 100644 --- a/printer.py +++ b/printer.py @@ -1,3 +1,6 @@ +# -*- coding: UTF-8 -*- + + class Printer: def print_one_weibo(self, weibo): """打印一条微博""" @@ -12,7 +15,7 @@ def print_one_weibo(self, weibo): def print_user_info(self, user): """打印微博用户信息""" 
print(u'用户昵称: %s' % user['nickname']) - print(u'用户id: %s' % user['user_id']) + print(u'用户id: %s' % user['id']) print(u'微博数: %d' % user['weibo_num']) print(u'关注数: %d' % user['following']) print(u'粉丝数: %d' % user['followers']) diff --git a/spider.py b/spider.py index fb6cfd83..fb184a58 100644 --- a/spider.py +++ b/spider.py @@ -20,12 +20,12 @@ def __init__(self, config): """Weibo类初始化""" self.config = config # change cookie from string to dict - if type(self.config['cookie']) == type(''): + if type(self.config['cookie']) == type(u''): self.config['cookie'] = { t.strip().split("=")[0]: t.strip().split("=")[1] for t in self.config['cookie'].split(";") } - if type(self.config['user_id_list']) == type(""): + if type(self.config['user_id_list']) == type(u""): with open(self.config['user_id_list'], 'rb') as f: lines = f.read().splitlines() lines = [line.decode('utf-8') for line in lines] @@ -46,7 +46,7 @@ def __init__(self, config): def get_nickname(self): """获取用户昵称""" - url = 'https://weibo.cn/%s/info' % (self.user['user_id']) + url = 'https://weibo.cn/%s/info' % (self.user['id']) selector = self.parser.deal_html(url, self.config['cookie']) nickname = selector.xpath('//title/text()')[0] nickname = nickname[:-3] @@ -68,7 +68,7 @@ def get_user_info(self, selector): def get_one_page(self, page): """获取第page页的全部微博""" - url = 'https://weibo.cn/u/%s?page=%d' % (self.user['user_id'], page) + url = 'https://weibo.cn/u/%s?page=%d' % (self.user['id'], page) selector = self.parser.deal_html(url, self.config['cookie']) info = selector.xpath("//div[@class='c']") is_exist = info[0].xpath("div/span[@class='ctt']") @@ -98,7 +98,7 @@ def get_one_page(self, page): def get_weibo_info(self): """获取微博信息""" - url = 'https://weibo.cn/u/%s' % (self.user['user_id']) + url = 'https://weibo.cn/u/%s' % (self.user['id']) selector = self.parser.deal_html(url, self.config['cookie']) self.get_user_info(selector) # 获取用户昵称、微博数、关注数、粉丝数 @@ -127,7 +127,7 @@ def initialize_info(self, user_id): """初始化爬虫信息""" 
self.got_num = 0 # 爬取到的微博数 self.weibo = [] # 存储爬取到的所有微博信息 - self.user = {'user_id': user_id} # 存储爬取到的用户信息 + self.user = {'id': user_id} # 存储爬取到的用户信息 self.weibo_id_list = [] # 存储爬取到的所有微博id def start(self): diff --git a/validator.py b/validator.py index 86d12f1a..3d31b437 100644 --- a/validator.py +++ b/validator.py @@ -1,3 +1,5 @@ +# -*- coding: UTF-8 -*- + from datetime import datetime import sys @@ -39,4 +41,4 @@ def validate(self): sys.exit("%s值应为yyyy-mm-dd形式或整数,请重新输入" % key) for mode in self.config['write_mode']: if mode not in ['txt', 'csv', 'mysql', 'mongo']: - sys.exit("write_mode值应为txt,csv,mysql,mongo,请重新输入") \ No newline at end of file + sys.exit("write_mode值应为txt,csv,mysql,mongo,请重新输入") diff --git a/writer.py b/writer.py index d4da442c..47d2fab8 100644 --- a/writer.py +++ b/writer.py @@ -1,3 +1,4 @@ +# -*- coding: UTF-8 -*- import csv import os import sys @@ -52,7 +53,7 @@ def write_user(self, user): else: result_header = u'\n\n微博内容: \n' result_header = (u'用户信息\n用户昵称:' + user['nickname'] + u'\n用户id: ' + - str(user['user_id']) + u'\n微博数: ' + + str(user['id']) + u'\n微博数: ' + str(user['weibo_num']) + u'\n关注数: ' + str(user['following']) + u'\n粉丝数: ' + str(user['followers']) + result_header) @@ -100,22 +101,38 @@ def write_user(self, user): if not self.config['filter']: result_headers.insert(-1, '被转发微博原始图片url') result_headers.insert(-1, '是否为原创微博') - with open(get_filepath('csv', self.user['nickname']), - 'a', - encoding='utf-8-sig', - newline='') as f: - csv_writer = csv.writer(f) - csv_writer.writerows([result_headers]) + + if sys.version < '3': # python2.x + reload(sys) + sys.setdefaultencoding('utf-8') + with open(get_filepath('csv', self.user['nickname']), 'ab') as f: + csv_writer = csv.writer(f) + csv_writer.writerows([result_headers]) + else: # python3.x + with open(get_filepath('csv', self.user['nickname']), + 'a', + encoding='utf-8-sig', + newline='') as f: + csv_writer = csv.writer(f) + csv_writer.writerows([result_headers]) def write_weibo(self, 
weibo): """将爬取的信息写入csv文件""" result_data = [w.values() for w in weibo] - with open(get_filepath('csv', self.user['nickname']), - 'a', - encoding='utf-8-sig', - newline='') as f: - csv_writer = csv.writer(f) - csv_writer.writerows(result_data) + + if sys.version < '3': # python2.x + reload(sys) + sys.setdefaultencoding('utf-8') + with open(get_filepath('csv', self.user['nickname']), 'ab') as f: + csv_writer = csv.writer(f) + csv_writer.writerows(result_data) + else: # python3.x + with open(get_filepath('csv', self.user['nickname']), + 'a', + encoding='utf-8-sig', + newline='') as f: + csv_writer = csv.writer(f) + csv_writer.writerows(result_data) print(u'%d条微博写入csv文件完毕,保存路径:' % len(weibo)) print(get_filepath('csv', self.user['nickname'])) @@ -125,39 +142,41 @@ class MongoWriter: def __init__(self, config): self.config = config - def write_user(self, user): - """将爬取的用户信息写入MongoDB数据库""" - self.user = user - - user_list = [user] - self.info_to_mongodb('user', user_list) - print(u'%s信息写入MongoDB数据库完毕' % user['nickname']) - def info_to_mongodb(self, collection, info_list): """将爬取的信息写入MongoDB数据库""" try: import pymongo + from pymongo import MongoClient except ImportError: sys.exit(u'系统中可能没有安装pymongo库,请先运行 pip install pymongo ,再运行程序') - try: - from pymongo import MongoClient + try: client = MongoClient() - db = client['weibo'] - collection = db[collection] - for info in info_list: - if not collection.find_one({'id': info['id']}): - collection.insert_one(info) - else: - collection.update_one({'id': info['id']}, {'$set': info}) except pymongo.errors.ServerSelectionTimeoutError: sys.exit(u'系统中可能没有安装或启动MongoDB数据库,请先根据系统环境安装或启动MongoDB,再运行程序') - def weibo_to_mongodb(self, weibo): + db = client['weibo'] + collection = db[collection] + for info in info_list: + if not collection.find_one({'id': info['id']}): + collection.insert_one(info) + else: + collection.update_one( + {'id': info['id']}, {'$set': info}) + + def write_user(self, user): + """将爬取的用户信息写入MongoDB数据库""" + self.user = 
user + + user_list = [user] + self.info_to_mongodb('user', user_list) + print(u'%s信息写入MongoDB数据库完毕' % user['nickname']) + + def write_weibo(self, weibo): """将爬取的微博信息写入MongoDB数据库""" weibo_list = [] for w in weibo: - w['user_id'] = self.user['user_id'] + w['user_id'] = self.user['id'] weibo_list.append(w) self.info_to_mongodb('weibo', weibo_list) print(u'%d条微博写入MongoDB数据库完毕' % len(weibo)) @@ -212,7 +231,7 @@ def write_weibo(self, weibo): # 在'weibo'表中插入或更新微博数据 weibo_list = [] for w in weibo: - w['user_id'] = self.user['user_id'] + w['user_id'] = self.user['id'] weibo_list.append(w) self.mysql_insert('weibo', weibo_list) print(u'%d条微博写入MySQL数据库完毕' % len(weibo)) @@ -227,9 +246,15 @@ def mysql_create(self, connection, sql): def mysql_create_database(self, sql): """创建MySQL数据库""" - import pymysql + try: + import pymysql + except ImportError: + sys.exit(u'系统中可能没有安装pymysql库,请先运行 pip install pymysql ,再运行程序') mysql_config = self.config['mysql_config'] - connection = pymysql.connect(**mysql_config) + try: + connection = pymysql.connect(**mysql_config) + except pymysql.err.OperationalError: + sys.exit(u'系统中可能没有安装或启动MySQL数据库或配置错误,请先根据系统环境安装或启动MySQL,再运行程序') self.mysql_create(connection, sql) def mysql_create_table(self, sql): From 224850508d19a9ef8e98256e739029212e86761b Mon Sep 17 00:00:00 2001 From: chenlei Date: Mon, 18 Nov 2019 20:18:46 +0800 Subject: [PATCH 089/363] =?UTF-8?q?fix:=20=E4=BF=AE=E5=A4=8D=E9=83=A8?= =?UTF-8?q?=E5=88=86=E7=B3=BB=E7=BB=9F=E5=9B=A0=E8=BF=9B=E5=BA=A6=E6=9D=A1?= =?UTF-8?q?=E5=8C=85=E5=90=AB=E4=B8=AD=E6=96=87=E5=87=BA=E9=94=99=E7=9A=84?= =?UTF-8?q?=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- weiboSpider.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/weiboSpider.py b/weiboSpider.py index ab5ccd4f..03dfeb02 100644 --- a/weiboSpider.py +++ b/weiboSpider.py @@ -835,7 +835,7 @@ def get_weibo_info(self): wrote_num = 0 page1 = 0 random_pages = random.randint(1, 5) - 
for page in tqdm(range(1, page_num + 1), desc=u'进度'): + for page in tqdm(range(1, page_num + 1), desc='Progress'): is_end = self.get_one_page(page) # 获取第page页的全部微博 if is_end: break From 2ba7ee333485bf3a69d524c59834cffe8b148bbf Mon Sep 17 00:00:00 2001 From: chenlei Date: Mon, 18 Nov 2019 20:34:44 +0800 Subject: [PATCH 090/363] =?UTF-8?q?fix:=20=E4=BF=AE=E5=A4=8D=E5=9C=A8?= =?UTF-8?q?=E9=83=A8=E5=88=86=E7=B3=BB=E7=BB=9F=E4=B8=AD=E4=B8=8B=E8=BD=BD?= =?UTF-8?q?=E5=9B=BE=E7=89=87=E8=A7=86=E9=A2=91=E6=97=B6=EF=BC=8C=E5=9B=A0?= =?UTF-8?q?=E8=BF=9B=E5=BA=A6=E6=9D=A1=E6=9C=89=E4=B8=AD=E6=96=87=E5=AF=BC?= =?UTF-8?q?=E8=87=B4=E7=A8=8B=E5=BA=8F=E5=87=BA=E9=94=99=E7=9A=84=E9=97=AE?= =?UTF-8?q?=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- weiboSpider.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/weiboSpider.py b/weiboSpider.py index 03dfeb02..1449d74e 100644 --- a/weiboSpider.py +++ b/weiboSpider.py @@ -481,7 +481,7 @@ def download_files(self, type): key = 'video_url' print(u'即将进行%s下载' % describe) file_dir = self.get_filepath(type) - for w in tqdm(self.weibo, desc=u'%s下载进度' % describe): + for w in tqdm(self.weibo, desc='Download progress'): if w[key] != u'无': file_prefix = w['publish_time'][:11].replace( '-', '') + '_' + w['id'] From 76b50ad2ec6c2b9e1f3ba2d0b4a4602db63b235e Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Mon, 18 Nov 2019 22:36:13 +0800 Subject: [PATCH 091/363] Update README.md --- README.md | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index a644220c..1e4daf95 100644 --- a/README.md +++ b/README.md @@ -3,6 +3,7 @@ * [实例](#实例) * [运行环境](#运行环境) * [使用说明](#使用说明) + * [版本](#0版本) * [下载脚本](#1下载脚本) * [安装依赖](#2安装依赖) * [设置cookie](#3设置cookie) @@ -15,7 +16,7 @@ * [注意事项](#注意事项) ## 功能 
-连续爬取**一个**或**多个**新浪微博用户(如[胡歌](https://weibo.cn/u/1223178222)、[迪丽热巴](https://weibo.cn/u/1669879400)、[郭碧婷](https://weibo.cn/u/1729370543))的数据,并将结果信息写入文件。写入信息几乎包括了用户微博的所有数据,主要有**用户信息**和**微博信息**两大类,前者包含用户昵称、关注数、粉丝数、微博数等等;后者包含微博正文、发布时间、发布工具、评论数等等,因为内容太多,这里不再赘述,详细内容见[输出](#输出)部分。具体的写入文件类型如下: +连续爬取**一个**或**多个**新浪微博用户(如[胡歌](https://weibo.cn/u/1223178222)、[迪丽热巴](https://weibo.cn/u/1669879400)、[郭碧婷](https://weibo.cn/u/1729370543))的数据,并将结果信息写入**文件**或**数据库**。写入信息几乎包括了用户微博的所有数据,主要有**用户信息**和**微博信息**两大类,前者包含用户昵称、关注数、粉丝数、微博数等等;后者包含微博正文、发布时间、发布工具、评论数等等,因为内容太多,这里不再赘述,详细内容见[输出](#输出)部分。具体的写入文件类型如下: - 写入**txt文件**(默认) - 写入**csv文件**(默认) - 写入**MySQL数据库**(可选) @@ -109,6 +110,13 @@ txt文件结果如下所示: - 系统: Windows/Linux/macOS ## 使用说明 +### 0.版本 +本程序有两个版本,**功能完全一样**。你现在看到的是单文件版,另一个是多文件版,[多文件版](https://github.com/dataabc/weiboSpider/tree/multi-file)位于multi-file分支。
+二者的区别在于: +>单文件版是所有代码都写到一个文件里,即[weiboSpider.py](https://github.com/dataabc/weiboSpider/blob/master/weiboSpider.py)。多文件版重构了单文件版,按照代码功能分成了几个文件,代码更清晰,更易读。如果你仅仅想使用程序,这两个版本用哪一个都一样;如果你不仅想使用,还想开发新功能,多文件版可能更容易。 + +多文件版由[songzy12](https://github.com/songzy12)重构。songzy12非常认真负责,对于我发现的问题都很耐心地修复了,而且效率非常高,在此感谢。
+本使用说明是单文件版的使用说明,后续会在[multi-file](https://github.com/dataabc/weiboSpider/tree/multi-file)分支的README文档中提供多文件版的使用说明。 ### 1.下载脚本 ```bash $ git clone https://github.com/dataabc/weibospider.git From 943492f2c1f5a28b0db9c81f05db9c1b8b4bba14 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Cdataabc?= Date: Tue, 19 Nov 2019 21:28:58 +0800 Subject: [PATCH 092/363] =?UTF-8?q?feat:=20=E6=B7=BB=E5=8A=A0=E5=86=99?= =?UTF-8?q?=E6=97=A5=E5=BF=97=E5=8A=9F=E8=83=BD=EF=BC=8C=E8=AE=B0=E5=BD=95?= =?UTF-8?q?cookie=E6=98=AF=E5=90=A6=E6=9C=89=E6=95=88?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 如果cookie过期,会将相关信息写入weibo文件夹下的log.txt --- weiboSpider.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/weiboSpider.py b/weiboSpider.py index 1449d74e..a3206967 100644 --- a/weiboSpider.py +++ b/weiboSpider.py @@ -91,6 +91,7 @@ def get_nickname(self): nickname = selector.xpath('//title/text()')[0] nickname = nickname[:-3] if nickname == u'登录 - 新' or nickname == u'新浪': + self.write_log() sys.exit(u'cookie错误或已过期,请按照README中方法重新获取') self.user['nickname'] = nickname except Exception as e: @@ -608,6 +609,17 @@ def get_filepath(self, type): print('Error: ', e) traceback.print_exc() + def write_log(self): + """当程序因cookie过期停止运行时,将相关信息写入log.txt""" + file_dir = os.path.split( + os.path.realpath(__file__))[0] + os.sep + 'weibo' + os.sep + if not os.path.isdir(file_dir): + os.makedirs(file_dir) + file_path = file_dir + 'log.txt' + content = u'cookie已过期,从%s到今天的微博获取失败,请重新设置cookie\n' % self.since_date + with open(file_path, 'ab') as f: + f.write(content.encode(sys.stdout.encoding)) + def write_csv(self, wrote_num): """将爬取的信息写入csv文件""" try: From 49e6e00a37b918b6573c60d0cc5e029d94efe589 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Cdataabc?= Date: Tue, 19 Nov 2019 21:56:02 +0800 Subject: [PATCH 093/363] =?UTF-8?q?fix:=20=E4=BF=AE=E5=A4=8D=E5=9C=A8?= =?UTF-8?q?=E9=83=A8=E5=88=86=E7=B3=BB=E7=BB=9F=E4=B8=AD=E8=BF=9B=E5=BA=A6?= 
=?UTF-8?q?=E6=9D=A1=E5=8C=85=E5=90=AB=E4=B8=AD=E6=96=87=E5=AF=BC=E8=87=B4?= =?UTF-8?q?=E5=87=BA=E9=94=99=E7=9A=84=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- downloader.py | 2 +- spider.py | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/downloader.py b/downloader.py index 87fc1595..b07bab60 100644 --- a/downloader.py +++ b/downloader.py @@ -22,7 +22,7 @@ def download_files(self, file_path, type, weibo): describe = u'视频' key = 'video_url' print(u'即将进行%s下载' % describe) - for w in tqdm(weibo, desc=u'%s下载进度' % describe): + for w in tqdm(weibo, desc='Download progress'): if w[key] != u'无': file_prefix = w['publish_time'][:11].replace( '-', '') + '_' + w['id'] diff --git a/spider.py b/spider.py index fb184a58..b2b71ba9 100644 --- a/spider.py +++ b/spider.py @@ -3,16 +3,16 @@ import random import sys -from datetime import datetime, date, timedelta +from datetime import date, datetime, timedelta from time import sleep from tqdm import tqdm -from validator import Validator -from printer import Printer -from writer import Writer, get_filepath from downloader import Downloader from html_parser import Parser +from printer import Printer +from validator import Validator +from writer import Writer, get_filepath class Spider(object): @@ -105,7 +105,7 @@ def get_weibo_info(self): page_num = self.parser.get_page_num(selector) # 获取微博总页数 page1 = 0 random_pages = random.randint(1, 5) - for page in tqdm(range(1, page_num + 1), desc=u'进度'): + for page in tqdm(range(1, page_num + 1), desc='Progress'): is_end = self.get_one_page(page) # 获取第page页的全部微博 if is_end: break From 077528c84871e042157191c5bc053d34e65dc018 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Cdataabc?= Date: Wed, 20 Nov 2019 01:56:21 +0800 Subject: [PATCH 094/363] =?UTF-8?q?feat:=20=E6=B7=BB=E5=8A=A0=E5=86=99?= =?UTF-8?q?=E6=97=A5=E5=BF=97=E5=8A=9F=E8=83=BD=EF=BC=8C=E6=A3=80=E6=B5=8B?= 
=?UTF-8?q?cookie=E6=98=AF=E5=90=A6=E6=9C=89=E6=95=88?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 若cookie过期,将相关信息写入weibo文件夹下的log.txt --- spider.py | 3 ++- writer.py | 15 +++++++++++++-- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/spider.py b/spider.py index b2b71ba9..401abf93 100644 --- a/spider.py +++ b/spider.py @@ -12,7 +12,7 @@ from html_parser import Parser from printer import Printer from validator import Validator -from writer import Writer, get_filepath +from writer import Writer, get_filepath, write_log class Spider(object): @@ -51,6 +51,7 @@ def get_nickname(self): nickname = selector.xpath('//title/text()')[0] nickname = nickname[:-3] if nickname == u'登录 - 新' or nickname == u'新浪': + write_log(self.config['since_date']) sys.exit(u'cookie错误或已过期,请按照README中方法重新获取') self.user['nickname'] = nickname diff --git a/writer.py b/writer.py index 47d2fab8..b688b5bf 100644 --- a/writer.py +++ b/writer.py @@ -19,6 +19,18 @@ def get_filepath(type, nickname): return file_path +def write_log(since_date): + """当程序因cookie过期停止运行时,将相关信息写入log.txt""" + file_dir = os.path.split( + os.path.realpath(__file__))[0] + os.sep + 'weibo' + os.sep + if not os.path.isdir(file_dir): + os.makedirs(file_dir) + file_path = file_dir + 'log.txt' + content = u'cookie已过期,从%s到今天的微博获取失败,请重新设置cookie\n' % since_date + with open(file_path, 'ab') as f: + f.write(content.encode(sys.stdout.encoding)) + + class Writer: def __init__(self, config): write_mode = config['write_mode'] @@ -161,8 +173,7 @@ def info_to_mongodb(self, collection, info_list): if not collection.find_one({'id': info['id']}): collection.insert_one(info) else: - collection.update_one( - {'id': info['id']}, {'$set': info}) + collection.update_one({'id': info['id']}, {'$set': info}) def write_user(self, user): """将爬取的用户信息写入MongoDB数据库""" From 7cf5b75fb4064406412cdb8523ccbb5144ae0304 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Cdataabc?= Date: Mon, 25 Nov 2019 21:36:46 
+0800 Subject: [PATCH 095/363] =?UTF-8?q?feat:=20=E6=B7=BB=E5=8A=A0=E9=85=8D?= =?UTF-8?q?=E7=BD=AE=E6=96=87=E4=BB=B6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- config.json | 16 +++++ weiboSpider.py | 164 ++++++++++++++++++++++--------------------------- 2 files changed, 91 insertions(+), 89 deletions(-) create mode 100644 config.json diff --git a/config.json b/config.json new file mode 100644 index 00000000..6a2bca8f --- /dev/null +++ b/config.json @@ -0,0 +1,16 @@ +{ + "user_id_list": ["1669879400"], + "filter": 1, + "since_date": "2018-01-01", + "write_mode": ["csv", "txt"], + "pic_download": 1, + "video_download": 1, + "cookie": "your cookie", + "mysql_config": { + "host": "localhost", + "port": 3306, + "user": "root", + "password": "123456", + "charset": "utf8mb4" + } +} \ No newline at end of file diff --git a/weiboSpider.py b/weiboSpider.py index a3206967..e8b63c5a 100644 --- a/weiboSpider.py +++ b/weiboSpider.py @@ -3,13 +3,14 @@ import codecs import csv +import json import os import random import re import sys import traceback from collections import OrderedDict -from datetime import datetime, timedelta +from datetime import date, datetime, timedelta from time import sleep import requests @@ -19,41 +20,66 @@ class Weibo(object): - cookie = {'Cookie': 'your cookie'} # 将your cookie替换成自己的cookie - - def __init__(self, - filter=0, - since_date='1900-01-01', - mongodb_write=0, - mysql_write=0, - pic_download=0, - video_download=0): + def __init__(self, config): """Weibo类初始化""" - if filter != 0 and filter != 1: - sys.exit(u'filter值应为0或1,请重新输入') - if not self.is_date(since_date): - sys.exit(u'since_date值应为yyyy-mm-dd形式,请重新输入') - if mongodb_write != 0 and mongodb_write != 1: - sys.exit(u'mongodb_write值应为0或1,请重新输入') - if mysql_write != 0 and mysql_write != 1: - sys.exit(u'mysql_write值应为0或1,请重新输入') - if pic_download != 0 and pic_download != 1: - sys.exit(u'pic_download值应为0或1,请重新输入') - if video_download != 0 and 
video_download != 1: - sys.exit(u'video_download值应为0或1,请重新输入') - self.user_id = '' # 用户id,如昵称为"Dear-迪丽热巴"的id为'1669879400' - self.filter = filter # 取值范围为0、1,程序默认值为0,代表要爬取用户的全部微博,1代表只爬取用户的原创微博 + self.validate_config(config) + self.filter = config[ + 'filter'] # 取值范围为0、1,程序默认值为0,代表要爬取用户的全部微博,1代表只爬取用户的原创微博 + since_date = str(config['since_date']) + if since_date.isdigit(): + since_date = str(date.today() - timedelta(int(since_date))) self.since_date = since_date # 起始时间,即爬取发布日期从该值到现在的微博,形式为yyyy-mm-dd - self.mongodb_write = mongodb_write # 值为0代表不将结果写入MongoDB数据库,1代表写入 - self.mysql_write = mysql_write # 值为0代表不将结果写入MySQL数据库,1代表写入 - self.pic_download = pic_download # 取值范围为0、1,程序默认值为0,代表不下载微博原始图片,1代表下载 - self.video_download = video_download # 取值范围为0、1,程序默认为0,代表不下载微博视频,1代表下载 - self.got_num = 0 # 爬取到的微博数 - self.weibo = [] # 存储爬取到的所有微博信息 + self.write_mode = config[ + 'write_mode'] # 结果信息保存类型,为list形式,可包含txt、csv、mongo和mysql四种类型 + self.pic_download = config[ + 'pic_download'] # 取值范围为0、1,程序默认值为0,代表不下载微博原始图片,1代表下载 + self.video_download = config[ + 'video_download'] # 取值范围为0、1,程序默认为0,代表不下载微博视频,1代表下载 + self.cookie = {'Cookie': config['cookie']} + self.mysql_config = config.get('mysql_config') # MySQL数据库连接配置,可以不填 + user_id_list = config['user_id_list'] + if not isinstance(user_id_list, list): + user_id_list = self.get_user_list(user_id_list) + self.user_id_list = user_id_list # 要爬取的微博用户的user_id列表 + self.user_id = '' # 用户id,如昵称为"Dear-迪丽热巴"的id为'1669879400' self.user = {} # 存储爬取到的用户信息 + self.got_num = 0 # 存储爬取到的微博数 + self.weibo = [] # 存储爬取到的所有微博信息 self.weibo_id_list = [] # 存储爬取到的所有微博id - self.mysql_config = { - } # MySQL数据库连接配置,可以不填,当使用者的mysql用户名、密码等与本程序默认值不同时,需要通过mysql_config来自定义 + + def validate_config(self, config): + """验证配置是否正确""" + + # 验证filter、pic_download、video_download + argument_lsit = ['filter', 'pic_download', 'video_download'] + for argument in argument_lsit: + if config[argument] != 0 and config[argument] != 1: + sys.exit(u'%s值应为0或1,请重新输入' % config[argument]) + + # 
验证since_date + since_date = str(config['since_date']) + if (not self.is_date(since_date)) and (not since_date.isdigit()): + sys.exit(u'since_date值应为yyyy-mm-dd形式或整数,请重新输入') + + # 验证write_mode + write_mode = ['txt', 'csv', 'mongo', 'mysql'] + if not isinstance(config['write_mode'], list): + sys.exit(u'write_mode值应为list类型') + for mode in config['write_mode']: + if mode not in write_mode: + sys.exit(u'%s为无效模式,请从txt、csv、mongo和mysql挑选一个或多个作为write_mode' % + mode) + + # 验证user_id_list + user_id_list = config['user_id_list'] + if (not isinstance(user_id_list, + list)) and (not user_id_list.endswith('.txt')): + sys.exit(u'user_id_list值应为list类型或txt文件路径') + if not isinstance(user_id_list, list): + if not os.path.isfile(user_id_list): + sys.exit( + u'当前路径:%s 不存在user_id_list.txt文件' % + (os.path.split(os.path.realpath(__file__))[0] + os.sep)) def is_date(self, since_date): """判断日期格式是否正确""" @@ -133,9 +159,9 @@ def user_to_mysql(self): def user_to_database(self): """将用户信息写入数据库""" - if self.mysql_write: + if 'mysql' in self.write_mode: self.user_to_mysql() - if self.mongodb_write: + if 'mongo' in self.write_mode: self.user_to_mongodb() def print_user_info(self): @@ -723,10 +749,6 @@ def weibo_to_mongodb(self, wrote_num): self.info_to_mongodb('weibo', weibo_list) print(u'%d条微博写入MongoDB数据库完毕' % self.got_num) - def change_mysql_config(self, mysql_config): - """修改MySQL数据库连接配置""" - self.mysql_config = mysql_config - def mysql_create(self, connection, sql): """创建MySQL数据库或表""" try: @@ -830,11 +852,13 @@ def weibo_to_mysql(self, wrote_num): def write_data(self, wrote_num): """将爬取到的信息写入文件或数据库""" if self.got_num > wrote_num: - self.write_csv(wrote_num) - self.write_txt(wrote_num) - if self.mysql_write: + if 'csv' in self.write_mode: + self.write_csv(wrote_num) + if 'txt' in self.write_mode: + self.write_txt(wrote_num) + if 'mysql' in self.write_mode: self.weibo_to_mysql(wrote_num) - if self.mongodb_write: + if 'mongo' in self.write_mode: self.weibo_to_mongodb(wrote_num) def 
get_weibo_info(self): @@ -892,10 +916,10 @@ def initialize_info(self, user_id): self.user_id = user_id self.weibo_id_list = [] - def start(self, user_id_list): + def start(self): """运行爬虫""" try: - for user_id in user_id_list: + for user_id in self.user_id_list: self.initialize_info(user_id) print('*' * 100) self.get_weibo_info() @@ -912,51 +936,13 @@ def start(self, user_id_list): def main(): try: - # 以下是程序配置信息,可以根据自己需求修改 - filter = 1 # 值为0表示爬取全部微博(原创微博+转发微博),值为1表示只爬取原创微博 - since_date = '2018-01-01' # 起始时间,即爬取发布日期从该值到现在的微博,形式为yyyy-mm-dd - """值为0代表不将结果写入MongoDB数据库,1代表写入;若要写入MongoDB数据库, - 请先安装MongoDB数据库和pymongo,pymongo安装方法为命令行运行:pip install pymongo""" - mongodb_write = 0 - """值为0代表不将结果写入MySQL数据库,1代表写入;若要写入MySQL数据库, - 请先安装MySQL数据库和pymysql,pymysql安装方法为命令行运行:pip install pymysql""" - mysql_write = 0 - pic_download = 1 # 值为0代表不下载微博原始图片,1代表下载微博原始图片 - video_download = 1 # 值为0代表不下载微博视频,1代表下载微博视频 - - wb = Weibo(filter, since_date, mongodb_write, mysql_write, - pic_download, video_download) - - # 下面是自定义MySQL数据库连接配置(可选) - """因为操作MySQL数据库需要用户名、密码等参数,本程序默认为: - mysql_config = { - 'host': 'localhost', - 'port': 3306, - 'user': 'root', - 'password': '123456', - 'charset': 'utf8mb4' - } - 大家的参数配置如果和默认值不同,可以将上面的参数值替换成自己的, - 然后添加如下代码,使修改生效,如果你的参数和默认值相同则不需要下面的代码: - wb.change_mysql_config(mysql_config)""" - - # 下面是配置user_id_list - """user_id_list包含了要爬的目标微博id,可以是一个,也可以是多个,也可以从文件中读取 - 爬单个微博,user_id_list如下所示,可以改成任意合法的用户id - user_id_list = ['1669879400'] - 爬多个微博,user_id_list如下所示,可以改成任意合法的用户id - user_id_list = ['1669879400', '1729370543'] - 也可以在文件中读取user_id_list,文件中可以包含很多user_id, - 每个user_id占一行,也可以在user_id后面加注释,如用户昵称,user_id和注释之间必需要有空格, - 文件名任意,类型为txt,位置位于本程序的同目录下,文件内容可以为如下形式: - 1223178222 胡歌 - 1669879400 迪丽热巴 - 1729370543 郭碧婷 - 比如文件可以叫user_id_list.txt,读取文件中的user_id_list如下所示: - user_id_list = wb.get_user_list('user_id_list.txt')""" - user_id_list = ['1669879400'] - - wb.start(user_id_list) # 爬取微博信息 + if not os.path.isfile('./config.json'): + sys.exit(u'当前路径:%s 不存在配置文件config.json' % + 
(os.path.split(os.path.realpath(__file__))[0] + os.sep)) + with open('./config.json') as f: + config = json.loads(f.read()) + wb = Weibo(config) + wb.start() # 爬取微博信息 except Exception as e: print('Error: ', e) traceback.print_exc() From 69cfa4cd721d3473f9804d45ef335095d7f5e503 Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Tue, 26 Nov 2019 02:58:51 +0800 Subject: [PATCH 096/363] Update README.md --- README.md | 213 ++++++++++++++++++++++++------------------------------ 1 file changed, 96 insertions(+), 117 deletions(-) diff --git a/README.md b/README.md index 1e4daf95..4c5fc536 100644 --- a/README.md +++ b/README.md @@ -6,11 +6,10 @@ * [版本](#0版本) * [下载脚本](#1下载脚本) * [安装依赖](#2安装依赖) - * [设置cookie](#3设置cookie) - * [设置user_id](#4设置user_id) - * [设置数据库(可选)](#5设置数据库可选) - * [运行脚本](#6运行脚本) - * [按需求修改脚本(可选)](#7按需求修改脚本可选) + * [程序设置](#3程序设置) + * [设置数据库(可选)](#4设置数据库可选) + * [运行脚本](#5运行脚本) + * [按需求修改脚本(可选)](#6按需求修改脚本可选) * [如何获取cookie](#如何获取cookie) * [如何获取user_id](#如何获取user_id) * [注意事项](#注意事项) @@ -65,31 +64,27 @@
## 实例 -以爬取迪丽热巴的微博为例。首先,我们需要为程序设置cookie值,cookie获取及设置方法见[设置cookie](#3设置cookie)。迪丽热巴的微博昵称为"Dear-迪丽热巴",id为1669879400,用户id获取方法见[如何获取user_id](#如何获取user_id)。我们选择爬取她的全部原创微博。具体方法是将**weibospider.py**文件的main函数主要部分修改为如下代码: -```python - # 以下是程序配置信息,可以根据自己需求修改 - filter = 1 # 值为0表示爬取全部微博(原创微博+转发微博),值为1表示只爬取原创微博 - since_date = '1900-01-01' # 起始时间,即爬取发布日期从该值到现在的微博,形式为yyyy-mm-dd - """值为0代表不将结果写入MongoDB数据库,1代表写入;若要写入MongoDB数据库, - 请先安装MongoDB数据库和pymongo,pymongo安装方法为命令行运行:pip install pymongo""" - mongodb_write = 0 - """值为0代表不将结果写入MySQL数据库,1代表写入;若要写入MySQL数据库, - 请先安装MySQL数据库和pymysql,pymysql安装方法为命令行运行:pip install pymysql""" - mysql_write = 0 - pic_download = 1 # 值为0代表不下载微博原始图片,1代表下载微博原始图片 - video_download = 1 # 值为0代表不下载微博视频,1代表下载微博视频 +以爬取迪丽热巴的微博为例,我们需要修改**config.json**文件,文件内容如下: +``` +{ + "user_id_list": ["1669879400"], + "filter": 1, + "since_date": "1900-01-01", + "write_mode": ["csv", "txt"], + "pic_download": 1, + "video_download": 1, + "cookie": "your cookie" +} +``` - wb = Weibo(filter, since_date, mongodb_write, mysql_write, - pic_download, video_download) - user_id_list = ['1669879400'] +对于上述参数的含义以及取值范围,这里仅作简单介绍,详细信息见[程序设置](#程序设置)。 +>**user_id_list**代表我们要爬取的微博用户的user_id,可以是一个或多个,也可以是文件路径,微博用户Dear-迪丽热巴的user_id为1669879400,具体如何获取user_id见[如何获取user_id](#如何获取user_id);
**filter**的值为1代表爬取全部原创微博,值为0代表爬取全部微博(原创+转发);
**since_date**代表我们要爬取since_date日期之后发布的微博,因为我要爬迪丽热巴的全部原创微博,所以since_date设置了一个非常早的值;
**write_mode**代表结果文件的保存类型,我想要把结果写入txt文件和csv文件,所以它的值为["csv", "txt"],如果你想写入数据库,具体设置见[设置数据库](#4设置数据库可选);
**pic_download**值为1代表下载微博中的图片,值为0代表不下载;
**video_download**值为1代表下载微博中的视频,值为0代表不下载;
**cookie**是爬虫微博的cookie,具体如何获取cookie见[如何获取cookie](#如何获取cookie),获取cookie后把"your cookie"替换成真实的cookie值即可。
- wb.start(user_id_list) # 爬取微博信息 -``` -代码具体含义注释里都有,不在赘述。设置完成后运行程序: +cookie修改完成后运行程序: ```bash $ python weibospider.py ``` -程序会自动生成一个weibo文件夹,我们以后爬取的所有微博都被存储在这里。然后程序在该文件夹下生成一个名为"Dear-迪丽热巴"的文件夹,迪丽热巴的所有微博爬取结果都在这里。"Dear-迪丽热巴"文件夹里包含一个csv文件、一个txt文件、一个img文件夹和一个video文件夹,img文件夹用来存储下载到的图片,video文件夹用来存储下载到的视频。如果你设置了保存数据库功能,这些信息也会保存在数据库里,数据库设置见[设置数据库](#5设置数据库可选)部分。
+程序会自动生成一个weibo文件夹,我们以后爬取的所有微博都被存储在这里。然后程序在该文件夹下生成一个名为"Dear-迪丽热巴"的文件夹,迪丽热巴的所有微博爬取结果都在这里。"Dear-迪丽热巴"文件夹里包含一个csv文件、一个txt文件、一个img文件夹和一个video文件夹,img文件夹用来存储下载到的图片,video文件夹用来存储下载到的视频。如果你设置了保存数据库功能,这些信息也会保存在数据库里,数据库设置见[设置数据库](#4设置数据库可选)部分。

csv文件结果如下所示: ![](https://picture.cognize.me/cognize/github/weibospider/weibo_csv.png)*1669879400.csv*
@@ -104,7 +99,7 @@ txt文件结果如下所示: 下载的视频如下所示: ![](https://picture.cognize.me/cognize/github/weibospider/video.png)*video文件夹*
本次下载了70个视频,是她原创微博中的视频,视频名为yyyymmdd+微博id的形式。其中有一个视频因为网络原因下载失败,程序将它的微博id和视频url以“weibo_id:video_url”的形式写到了同文件夹下的not_downloaded.txt里。
-因为我本地没有安装MySQL数据库和MongoDB数据库,所以暂时设置成不写入数据库。如果你想要将爬取结果写入数据库,只需要先安装数据库(MySQL或MongoDB),再安装对应包(pymysql或pymongo),然后将mysql_write或mongodb_write值设置为1即可。写入MySQL需要用户名、密码等配置信息,这些配置如何设置见[设置数据库](#5设置数据库可选)部分。 +因为我本地没有安装MySQL数据库和MongoDB数据库,所以暂时设置成不写入数据库。如果你想要将爬取结果写入数据库,只需要先安装数据库(MySQL或MongoDB),再安装对应包(pymysql或pymongo),然后将mysql_write或mongodb_write值设置为1即可。写入MySQL需要用户名、密码等配置信息,这些配置如何设置见[设置数据库](#4设置数据库可选)部分。 ## 运行环境 - 开发语言:python2/python3 - 系统: Windows/Linux/macOS @@ -126,59 +121,91 @@ $ git clone https://github.com/dataabc/weibospider.git ```bash $ pip install -r requirements.txt ``` -### 3.设置cookie -打开weibospider文件夹下的**weibospider.py**文件,将"**your cookie**"替换成爬虫微博的cookie,具体替换位置大约在**weibospider.py**文件的22行左右。cookie获取方法见[如何获取cookie](#如何获取cookie); -### 4.设置user_id -打开weibospider文件夹下的**weibospider.py**文件,将我们想要爬取的**一个**或**多个**微博的user_id赋值给user_id_list,user_id获取方法见[如何获取user_id](#如何获取user_id)。user_id设置代码位于**weibospider.py**的main函数里,具体代码如下: -```python -# 爬单个微博用户,可以改成任意合法的用户id -user_id_list = ['1669879400'] +### 3.程序设置 +打开**config.json**文件,你会看到如下内容: +``` +{ + "user_id_list": ["1669879400"], + "filter": 1, + "since_date": "2018-01-01", + "write_mode": ["csv", "txt"], + "pic_download": 1, + "video_download": 1, + "cookie": "your cookie", + "mysql_config": { + "host": "localhost", + "port": 3306, + "user": "root", + "password": "123456", + "charset": "utf8mb4" + } +} ``` -或者 -```python -# 爬多个微博用户,可以改成任意合法的用户id -user_id_list = ['1223178222', '1669879400', '1729370543'] +下面讲解每个参数的含义与设置方法。
+**设置user_id_list**
+user_id_list是我们要爬取的微博的id,可以是一个,也可以是多个,例如: ``` -也可以读取文件中的用户id,每个user_id占一行,也可以在user_id后面加注释(可选),如用户昵称等信息,user_id和注释之间必需要有空格,文件名任意,类型为txt,位置位于本程序的同目录下,文件内容示例如下: +"user_id_list": ["1223178222", "1669879400", "1729370543"], +``` +上述代码代表我们要连续爬取user_id分别为“1223178222”、 “1669879400”、 “1729370543”的三个用户的微博,具体如何获取user_id见[如何获取user_id](#如何获取user_id)。
+user_id_list的值也可以是文件路径,我们可以把要爬的所有微博用户的user_id都写到txt文件里,然后把文件的位置路径赋值给user_id_list。
+在txt文件中,每个user_id占一行,也可以在user_id后面加注释(可选),如用户昵称等信息,user_id和注释之间必需要有空格,文件名任意,类型为txt,位置位于本程序的同目录下,文件内容示例如下: ``` 1223178222 胡歌 1669879400 迪丽热巴 1729370543 郭碧婷 ``` -假如文件叫user_id_list.txt,则user_id设置代码为: -```python -user_id_list = wb.get_user_list('user_id_list.txt') +假如文件叫user_id_list.txt,则user_id_list设置代码为: +``` +"user_id_list": "user_id_list.txt", +``` +**设置filter**
+filter控制爬取范围,值为1代表爬取全部原创微博,值为0代表爬取全部微博(原创+转发)。例如,如果要爬全部原创微博,请使用如下代码: +``` +"filter": 1, +``` +**设置since_date**
+since_date值可以是日期,也可以是整数。如果是日期,代表爬取该日期之后的微博,格式应为“yyyy-mm-dd”,如: +``` +"since_date": "2018-01-01", +``` +代表爬取从2018年1月1日到现在的微博。
+如果是整数,代表爬取最近n天的微博,如: +``` +"since_date": 10, +``` +代表爬取最近10天的微博,这个说法不是特别准确,准确说是爬取发布时间从**10天前到本程序开始执行时**之间的微博。
+**设置write_mode**
+write_mode控制结果文件格式,取值范围是csv、txt、mongo和mysql,分别代表将结果文件写入csv、txt、MongoDB和MySQL数据库。write_mode可以同时包含这些取值中的一个或几个,如: ``` -### 5.设置数据库(可选) +"write_mode": ["csv", "txt"], +``` +代表将结果信息写入csv文件和txt文件。特别注意,如果你想写入数据库,除了在write_mode添加对应数据库的名字外,还应该安装相关数据库和对应python模块,具体操作见[设置数据库](#4设置数据库可选)部分。
+**设置pic_download**
+pic_download控制是否下载微博中的图片,值为1代表下载,值为0代表不下载,如 +``` +"pic_download": 1, +``` +代表下载微博中的图片。
+**设置video_download**
+video_download控制是否下载微博中的视频,值为1代表下载,值为0代表不下载,如 +``` +"video_download": 1, +``` +**设置cookie**
+请按照[如何获取cookie](#如何获取cookie),获取cookie,然后将“your cookie”替换成真实的cookie值。
+**设置mysql_config(可选)**
+mysql_config控制mysql参数配置。如果你不需要将结果信息写入mysql,这个参数可以忽略,即删除或保留都无所谓;如果你需要写入mysql且config.json文件中mysql_config的配置与你的mysql配置不一样,请将该值改成你自己mysql中的参数配置。 + +### 4.设置数据库(可选) 本部分是可选部分,如果不需要将爬取信息写入数据库,可跳过这一步。本程序目前支持MySQL数据库和MongoDB数据库,如果你需要写入其它数据库,可以参考这两个数据库的写法自己编写。
**MySQL数据库写入**
-要想将爬取信息写入MySQL,请将main函数中的mysql_write变量值改为1。再根据自己的系统环境安装MySQL,然后命令行执行: +要想将爬取信息写入MySQL,请据自己的系统环境安装MySQL,然后命令行执行: ```bash $ pip install pymysql ``` -MySQL写入需要主机、端口号、用户名、密码等配置,本程序默认的配置如下: -```python - mysql_config = { - 'host': 'localhost', - 'port': 3306, - 'user': 'root', - 'password': '123456', - 'charset': 'utf8mb4' - } -``` -如果你的配置和上面不同,需要修改main函数,将本程序的配置改成自己的配置,具体代码如下: -```python - mysql_config = { - 'host': 'xxx', - 'port': xxx, - 'user': 'xxx', - 'password': 'xxx', - 'charset': 'utf8mb4' - } - wb.change_mysql_config(mysql_config) -``` **MongoDB数据库写入**
-要想将爬取信息写入MongoDB,请将main函数中的mongodb_write变量值改为1。再根据自己的系统环境安装MongoDB,然后命令行执行: +要想将爬取信息写入MongoDB,请根据自己的系统环境安装MongoDB,然后命令行执行: ```bash $ pip install pymongo ``` @@ -208,63 +235,15 @@ MySQL和MongDB数据库的写入内容一样。程序首先会创建一个名为
-### 6.运行脚本 +### 5.运行脚本 大家可以根据自己的运行环境选择运行方式,Linux可以通过 ```bash $ python weibospider.py ``` 运行; -### 7.按需求修改脚本(可选) -本程序是一个Weibo类,用户可以按照自己的需求调用Weibo类。 -用户可以直接在**weibospider.py**文件中调用Weibo类,具体调用代码示例如下: -```python - # 以下是程序配置信息,可以根据自己需求修改 - filter = 1 # 值为0表示爬取全部微博(原创微博+转发微博),值为1表示只爬取原创微博 - since_date = '2019-07-01' # 起始时间,即爬取发布日期从该值到现在的微博,形式为yyyy-mm-dd - """值为0代表不将结果写入MongoDB数据库,1代表写入;若要写入MongoDB数据库, - 请先安装MongoDB数据库和pymongo,pymongo安装方法为命令行运行:pip install pymongo""" - mongodb_write = 1 - """值为0代表不将结果写入MySQL数据库,1代表写入;若要写入MySQL数据库, - 请先安装MySQL数据库和pymysql,pymysql安装方法为命令行运行:pip install pymysql""" - mysql_write = 1 - pic_download = 0 # 值为0代表不下载微博原始图片,1代表下载微博原始图片 - video_download = 0 # 值为0代表不下载微博视频,1代表下载微博视频 - - wb = Weibo(filter, since_date, mongodb_write, mysql_write, - pic_download, video_download) - - # 下面是自定义MySQL数据库连接配置(可选) - """因为操作MySQL数据库需要用户名、密码等参数,本程序默认为: - mysql_config = { - 'host': 'localhost', - 'port': 3306, - 'user': 'root', - 'password': '123456', - 'charset': 'utf8mb4' - } - 大家的参数配置如果和默认值不同,可以将上面的参数值替换成自己的, - 然后添加如下代码,使修改生效,如果你的参数和默认值相同则不需要下面的代码: - wb.change_mysql_config(mysql_config)""" - - # 下面是配置user_id_list - """user_id_list包含了要爬的目标微博id,可以是一个,也可以是多个,也可以从文件中读取 - 爬单个微博,user_id_list如下所示,可以改成任意合法的用户id - user_id_list = ['1669879400'] - 爬多个微博,user_id_list如下所示,可以改成任意合法的用户id - user_id_list = ['1223178222', '1669879400', '1729370543'] - 也可以在文件中读取user_id_list,文件中可以包含很多user_id, - 每个user_id占一行,也可以在user_id后面加注释,如用户昵称,user_id和注释之间必需要有空格, - 文件名任意,类型为txt,位置位于本程序的同目录下,文件内容可以为如下形式: - 1223178222 胡歌 - 1669879400 迪丽热巴 - 1729370543 郭碧婷 - 比如文件可以叫user_id_list.txt,读取文件中的user_id_list如下所示: - user_id_list = wb.get_user_list('user_id_list.txt')""" - user_id_list = ['1223178222', '1669879400', '1729370543'] - - wb.start(user_id_list) # 爬取微博信息 -``` -通过执行wb.start() 完成了微博的爬取工作。在上述代码执行后,我们可以得到很多信息:
+### 6.按需求修改脚本(可选) +本程序是一个Weibo类,用户可以按照自己的需求调用或修改。
+通过执行本程序,我们可以得到很多信息:
**wb.nickname**:用户昵称;
**wb.weibo_num**:微博数;
**wb.following**:关注数;
From f7458bd3c9f1cafd8219382838f846799b02785f Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Tue, 26 Nov 2019 12:46:08 +0800 Subject: [PATCH 097/363] Update README.md --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 4c5fc536..9f9fcee9 100644 --- a/README.md +++ b/README.md @@ -67,7 +67,7 @@ 以爬取迪丽热巴的微博为例,我们需要修改**config.json**文件,文件内容如下: ``` { - "user_id_list": ["1669879400"], + "user_id_list": ["1669879400"], "filter": 1, "since_date": "1900-01-01", "write_mode": ["csv", "txt"], @@ -77,7 +77,7 @@ } ``` -对于上述参数的含义以及取值范围,这里仅作简单介绍,详细信息见[程序设置](#程序设置)。 +对于上述参数的含义以及取值范围,这里仅作简单介绍,详细信息见[程序设置](#3程序设置)。 >**user_id_list**代表我们要爬取的微博用户的user_id,可以是一个或多个,也可以是文件路径,微博用户Dear-迪丽热巴的user_id为1669879400,具体如何获取user_id见[如何获取user_id](#如何获取user_id);
**filter**的值为1代表爬取全部原创微博,值为0代表爬取全部微博(原创+转发);
**since_date**代表我们要爬取since_date日期之后发布的微博,因为我要爬迪丽热巴的全部原创微博,所以since_date设置了一个非常早的值;
**write_mode**代表结果文件的保存类型,我想要把结果写入txt文件和csv文件,所以它的值为["csv", "txt"],如果你想写入数据库,具体设置见[设置数据库](#4设置数据库可选);
**pic_download**值为1代表下载微博中的图片,值为0代表不下载;
**video_download**值为1代表下载微博中的视频,值为0代表不下载;
**cookie**是爬虫微博的cookie,具体如何获取cookie见[如何获取cookie](#如何获取cookie),获取cookie后把"your cookie"替换成真实的cookie值即可。
cookie修改完成后运行程序: @@ -125,7 +125,7 @@ $ pip install -r requirements.txt 打开**config.json**文件,你会看到如下内容: ``` { - "user_id_list": ["1669879400"], + "user_id_list": ["1669879400"], "filter": 1, "since_date": "2018-01-01", "write_mode": ["csv", "txt"], From 94eceb21492fd28b2fa9ac2bed0cef0bcbfd295f Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Tue, 26 Nov 2019 20:18:35 +0800 Subject: [PATCH 098/363] Update README.md --- README.md | 237 ++++++++++++++++++++++++++---------------------------- 1 file changed, 116 insertions(+), 121 deletions(-) diff --git a/README.md b/README.md index a644220c..76bf5eaf 100644 --- a/README.md +++ b/README.md @@ -3,19 +3,19 @@ * [实例](#实例) * [运行环境](#运行环境) * [使用说明](#使用说明) + * [版本](#0版本) * [下载脚本](#1下载脚本) * [安装依赖](#2安装依赖) - * [设置cookie](#3设置cookie) - * [设置user_id](#4设置user_id) - * [设置数据库(可选)](#5设置数据库可选) - * [运行脚本](#6运行脚本) - * [按需求修改脚本(可选)](#7按需求修改脚本可选) + * [程序设置](#3程序设置) + * [设置数据库(可选)](#4设置数据库可选) + * [运行脚本](#5运行脚本) + * [按需求修改脚本(可选)](#6按需求修改脚本可选) * [如何获取cookie](#如何获取cookie) * [如何获取user_id](#如何获取user_id) * [注意事项](#注意事项) ## 功能 -连续爬取**一个**或**多个**新浪微博用户(如[胡歌](https://weibo.cn/u/1223178222)、[迪丽热巴](https://weibo.cn/u/1669879400)、[郭碧婷](https://weibo.cn/u/1729370543))的数据,并将结果信息写入文件。写入信息几乎包括了用户微博的所有数据,主要有**用户信息**和**微博信息**两大类,前者包含用户昵称、关注数、粉丝数、微博数等等;后者包含微博正文、发布时间、发布工具、评论数等等,因为内容太多,这里不再赘述,详细内容见[输出](#输出)部分。具体的写入文件类型如下: +连续爬取**一个**或**多个**新浪微博用户(如[胡歌](https://weibo.cn/u/1223178222)、[迪丽热巴](https://weibo.cn/u/1669879400)、[郭碧婷](https://weibo.cn/u/1729370543))的数据,并将结果信息写入**文件**或**数据库**。写入信息几乎包括了用户微博的所有数据,主要有**用户信息**和**微博信息**两大类,前者包含用户昵称、关注数、粉丝数、微博数等等;后者包含微博正文、发布时间、发布工具、评论数等等,因为内容太多,这里不再赘述,详细内容见[输出](#输出)部分。具体的写入文件类型如下: - 写入**txt文件**(默认) - 写入**csv文件**(默认) - 写入**MySQL数据库**(可选) @@ -64,31 +64,27 @@
## 实例 -以爬取迪丽热巴的微博为例。首先,我们需要为程序设置cookie值,cookie获取及设置方法见[设置cookie](#3设置cookie)。迪丽热巴的微博昵称为"Dear-迪丽热巴",id为1669879400,用户id获取方法见[如何获取user_id](#如何获取user_id)。我们选择爬取她的全部原创微博。具体方法是将**weibospider.py**文件的main函数主要部分修改为如下代码: -```python - # 以下是程序配置信息,可以根据自己需求修改 - filter = 1 # 值为0表示爬取全部微博(原创微博+转发微博),值为1表示只爬取原创微博 - since_date = '1900-01-01' # 起始时间,即爬取发布日期从该值到现在的微博,形式为yyyy-mm-dd - """值为0代表不将结果写入MongoDB数据库,1代表写入;若要写入MongoDB数据库, - 请先安装MongoDB数据库和pymongo,pymongo安装方法为命令行运行:pip install pymongo""" - mongodb_write = 0 - """值为0代表不将结果写入MySQL数据库,1代表写入;若要写入MySQL数据库, - 请先安装MySQL数据库和pymysql,pymysql安装方法为命令行运行:pip install pymysql""" - mysql_write = 0 - pic_download = 1 # 值为0代表不下载微博原始图片,1代表下载微博原始图片 - video_download = 1 # 值为0代表不下载微博视频,1代表下载微博视频 +以爬取迪丽热巴的微博为例,我们需要修改**config.json**文件,文件内容如下: +``` +{ + "user_id_list": ["1669879400"], + "filter": 1, + "since_date": "1900-01-01", + "write_mode": ["csv", "txt"], + "pic_download": 1, + "video_download": 1, + "cookie": "your cookie" +} +``` - wb = Weibo(filter, since_date, mongodb_write, mysql_write, - pic_download, video_download) - user_id_list = ['1669879400'] +对于上述参数的含义以及取值范围,这里仅作简单介绍,详细信息见[程序设置](#3程序设置)。 +>**user_id_list**代表我们要爬取的微博用户的user_id,可以是一个或多个,也可以是文件路径,微博用户Dear-迪丽热巴的user_id为1669879400,具体如何获取user_id见[如何获取user_id](#如何获取user_id);
**filter**的值为1代表爬取全部原创微博,值为0代表爬取全部微博(原创+转发);
**since_date**代表我们要爬取since_date日期之后发布的微博,因为我要爬迪丽热巴的全部原创微博,所以since_date设置了一个非常早的值;
**write_mode**代表结果文件的保存类型,我想要把结果写入txt文件和csv文件,所以它的值为["csv", "txt"],如果你想写入数据库,具体设置见[设置数据库](#4设置数据库可选);
**pic_download**值为1代表下载微博中的图片,值为0代表不下载;
**video_download**值为1代表下载微博中的视频,值为0代表不下载;
**cookie**是爬虫微博的cookie,具体如何获取cookie见[如何获取cookie](#如何获取cookie),获取cookie后把"your cookie"替换成真实的cookie值即可。
- wb.start(user_id_list) # 爬取微博信息 -``` -代码具体含义注释里都有,不在赘述。设置完成后运行程序: +cookie修改完成后运行程序: ```bash -$ python weibospider.py +$ python spider.py ``` -程序会自动生成一个weibo文件夹,我们以后爬取的所有微博都被存储在这里。然后程序在该文件夹下生成一个名为"Dear-迪丽热巴"的文件夹,迪丽热巴的所有微博爬取结果都在这里。"Dear-迪丽热巴"文件夹里包含一个csv文件、一个txt文件、一个img文件夹和一个video文件夹,img文件夹用来存储下载到的图片,video文件夹用来存储下载到的视频。如果你设置了保存数据库功能,这些信息也会保存在数据库里,数据库设置见[设置数据库](#5设置数据库可选)部分。
+程序会自动生成一个weibo文件夹,我们以后爬取的所有微博都被存储在这里。然后程序在该文件夹下生成一个名为"Dear-迪丽热巴"的文件夹,迪丽热巴的所有微博爬取结果都在这里。"Dear-迪丽热巴"文件夹里包含一个csv文件、一个txt文件、一个img文件夹和一个video文件夹,img文件夹用来存储下载到的图片,video文件夹用来存储下载到的视频。如果你设置了保存数据库功能,这些信息也会保存在数据库里,数据库设置见[设置数据库](#4设置数据库可选)部分。

csv文件结果如下所示: ![](https://picture.cognize.me/cognize/github/weibospider/weibo_csv.png)*1669879400.csv*
@@ -103,74 +99,121 @@ txt文件结果如下所示: 下载的视频如下所示: ![](https://picture.cognize.me/cognize/github/weibospider/video.png)*video文件夹*
本次下载了70个视频,是她原创微博中的视频,视频名为yyyymmdd+微博id的形式。其中有一个视频因为网络原因下载失败,程序将它的微博id和视频url以“weibo_id:video_url”的形式写到了同文件夹下的not_downloaded.txt里。
-因为我本地没有安装MySQL数据库和MongoDB数据库,所以暂时设置成不写入数据库。如果你想要将爬取结果写入数据库,只需要先安装数据库(MySQL或MongoDB),再安装对应包(pymysql或pymongo),然后将mysql_write或mongodb_write值设置为1即可。写入MySQL需要用户名、密码等配置信息,这些配置如何设置见[设置数据库](#5设置数据库可选)部分。 +因为我本地没有安装MySQL数据库和MongoDB数据库,所以暂时设置成不写入数据库。如果你想要将爬取结果写入数据库,只需要先安装数据库(MySQL或MongoDB),再安装对应包(pymysql或pymongo),然后在config.json的write_mode中添加对应的取值(mysql或mongo)即可。写入MySQL需要用户名、密码等配置信息,这些配置如何设置见[设置数据库](#4设置数据库可选)部分。 ## 运行环境 - 开发语言:python2/python3 - 系统: Windows/Linux/macOS ## 使用说明 +### 0.版本 +本程序有两个版本,**功能完全一样**。你现在看到的是多文件版,另一个是单文件版,[单文件版](https://github.com/dataabc/weiboSpider)位于master分支。  
+二者的区别在于: +>单文件版是所有代码都写到一个文件里,即[weiboSpider.py](https://github.com/dataabc/weiboSpider/blob/master/weiboSpider.py)。多文件版重构了单文件版,按照代码功能分成了几个文件,代码更清晰,更易读。如果你仅仅想使用程序,这两个版本用哪一个都一样;如果你不仅想使用,还想开发新功能,多文件版可能更容易。 + +多文件版由[songzy12](https://github.com/songzy12)重构。songzy12非常认真负责,对于我发现的问题都很耐心地修复了,而且效率非常高,在此感谢。
+本使用说明是多文件版的使用说明。 +**本版本包含文件及说明如下** +* config.json: 配置相关文件; +* spider.py: 爬虫逻辑主文件; +* printer.py: 打印调试信息相关; +* writer.py: 保存至本地文件或数据库相关; +* validator.py: 配置参数验证相关; +* html_parser.py: 网页解析相关; +* downloader.py: 图片、视频下载相关。 ### 1.下载脚本 ```bash -$ git clone https://github.com/dataabc/weibospider.git +$ git clone -b multi-file https://github.com/dataabc/weibospider.git ``` 运行上述命令,将本项目下载到当前目录,如果下载成功当前目录会出现一个名为"weibospider"的文件夹; ### 2.安装依赖 ```bash $ pip install -r requirements.txt ``` -### 3.设置cookie -打开weibospider文件夹下的**weibospider.py**文件,将"**your cookie**"替换成爬虫微博的cookie,具体替换位置大约在**weibospider.py**文件的22行左右。cookie获取方法见[如何获取cookie](#如何获取cookie); -### 4.设置user_id -打开weibospider文件夹下的**weibospider.py**文件,将我们想要爬取的**一个**或**多个**微博的user_id赋值给user_id_list,user_id获取方法见[如何获取user_id](#如何获取user_id)。user_id设置代码位于**weibospider.py**的main函数里,具体代码如下: -```python -# 爬单个微博用户,可以改成任意合法的用户id -user_id_list = ['1669879400'] +### 3.程序设置 +打开**config.json**文件,你会看到如下内容: ``` -或者 -```python -# 爬多个微博用户,可以改成任意合法的用户id -user_id_list = ['1223178222', '1669879400', '1729370543'] +{ + "user_id_list": ["1669879400"], + "filter": 1, + "since_date": "2018-01-01", + "write_mode": ["csv", "txt"], + "pic_download": 1, + "video_download": 1, + "cookie": "your cookie", + "mysql_config": { + "host": "localhost", + "port": 3306, + "user": "root", + "password": "123456", + "charset": "utf8mb4" + } +} ``` -也可以读取文件中的用户id,每个user_id占一行,也可以在user_id后面加注释(可选),如用户昵称等信息,user_id和注释之间必需要有空格,文件名任意,类型为txt,位置位于本程序的同目录下,文件内容示例如下: +下面讲解每个参数的含义与设置方法。
+**设置user_id_list**
+user_id_list是我们要爬取的微博的id,可以是一个,也可以是多个,例如: +``` +"user_id_list": ["1223178222", "1669879400", "1729370543"], +``` +上述代码代表我们要连续爬取user_id分别为“1223178222”、 “1669879400”、 “1729370543”的三个用户的微博,具体如何获取user_id见[如何获取user_id](#如何获取user_id)。
+user_id_list的值也可以是文件路径,我们可以把要爬的所有微博用户的user_id都写到txt文件里,然后把文件的位置路径赋值给user_id_list。
+在txt文件中,每个user_id占一行,也可以在user_id后面加注释(可选),如用户昵称等信息,user_id和注释之间必须要有空格,文件名任意,类型为txt,位置位于本程序的同目录下,文件内容示例如下: ``` 1223178222 胡歌 1669879400 迪丽热巴 1729370543 郭碧婷 ``` -假如文件叫user_id_list.txt,则user_id设置代码为: -```python -user_id_list = wb.get_user_list('user_id_list.txt') +假如文件叫user_id_list.txt,则user_id_list设置代码为: +``` +"user_id_list": "user_id_list.txt", +``` +**设置filter**  
+filter控制爬取范围,值为1代表爬取全部原创微博,值为0代表爬取全部微博(原创+转发)。例如,如果要爬全部原创微博,请使用如下代码: +``` +"filter": 1, +``` +**设置since_date**
+since_date值可以是日期,也可以是整数。如果是日期,代表爬取该日期之后的微博,格式应为“yyyy-mm-dd”,如: +``` +"since_date": "2018-01-01", +``` +代表爬取从2018年1月1日到现在的微博。
+如果是整数,代表爬取最近n天的微博,如: +``` +"since_date": 10, +``` +代表爬取最近10天的微博,这个说法不是特别准确,准确说是爬取发布时间从**10天前到本程序开始执行时**之间的微博。
+**设置write_mode**
+write_mode控制结果文件格式,取值范围是csv、txt、mongo和mysql,分别代表将结果文件写入csv、txt、MongoDB和MySQL数据库。write_mode可以同时包含这些取值中的一个或几个,如: ``` -### 5.设置数据库(可选) +"write_mode": ["csv", "txt"], +``` +代表将结果信息写入csv文件和txt文件。特别注意,如果你想写入数据库,除了在write_mode添加对应数据库的名字外,还应该安装相关数据库和对应python模块,具体操作见[设置数据库](#4设置数据库可选)部分。
+**设置pic_download**
+pic_download控制是否下载微博中的图片,值为1代表下载,值为0代表不下载,如 +``` +"pic_download": 1, +``` +代表下载微博中的图片。
+**设置video_download**
+video_download控制是否下载微博中的视频,值为1代表下载,值为0代表不下载,如 +``` +"video_download": 1, +``` +**设置cookie**
+请按照[如何获取cookie](#如何获取cookie)中的方法获取cookie,然后将“your cookie”替换成真实的cookie值。  
+**设置mysql_config(可选)**
+mysql_config控制mysql参数配置。如果你不需要将结果信息写入mysql,这个参数可以忽略,即删除或保留都无所谓;如果你需要写入mysql且config.json文件中mysql_config的配置与你的mysql配置不一样,请将该值改成你自己mysql中的参数配置。 + +### 4.设置数据库(可选) 本部分是可选部分,如果不需要将爬取信息写入数据库,可跳过这一步。本程序目前支持MySQL数据库和MongoDB数据库,如果你需要写入其它数据库,可以参考这两个数据库的写法自己编写。
**MySQL数据库写入**
-要想将爬取信息写入MySQL,请将main函数中的mysql_write变量值改为1。再根据自己的系统环境安装MySQL,然后命令行执行: +要想将爬取信息写入MySQL,请根据自己的系统环境安装MySQL,然后命令行执行: ```bash $ pip install pymysql ``` -MySQL写入需要主机、端口号、用户名、密码等配置,本程序默认的配置如下: -```python - mysql_config = { - 'host': 'localhost', - 'port': 3306, - 'user': 'root', - 'password': '123456', - 'charset': 'utf8mb4' - } -``` -如果你的配置和上面不同,需要修改main函数,将本程序的配置改成自己的配置,具体代码如下: -```python - mysql_config = { - 'host': 'xxx', - 'port': xxx, - 'user': 'xxx', - 'password': 'xxx', - 'charset': 'utf8mb4' - } - wb.change_mysql_config(mysql_config) -``` **MongoDB数据库写入**
-要想将爬取信息写入MongoDB,请将main函数中的mongodb_write变量值改为1。再根据自己的系统环境安装MongoDB,然后命令行执行: +要想将爬取信息写入MongoDB,请根据自己的系统环境安装MongoDB,然后命令行执行: ```bash $ pip install pymongo ``` @@ -200,63 +243,15 @@ MySQL和MongDB数据库的写入内容一样。程序首先会创建一个名为
-### 6.运行脚本 +### 5.运行脚本 大家可以根据自己的运行环境选择运行方式,Linux可以通过 ```bash -$ python weibospider.py +$ python spider.py ``` 运行; -### 7.按需求修改脚本(可选) -本程序是一个Weibo类,用户可以按照自己的需求调用Weibo类。 -用户可以直接在**weibospider.py**文件中调用Weibo类,具体调用代码示例如下: -```python - # 以下是程序配置信息,可以根据自己需求修改 - filter = 1 # 值为0表示爬取全部微博(原创微博+转发微博),值为1表示只爬取原创微博 - since_date = '2019-07-01' # 起始时间,即爬取发布日期从该值到现在的微博,形式为yyyy-mm-dd - """值为0代表不将结果写入MongoDB数据库,1代表写入;若要写入MongoDB数据库, - 请先安装MongoDB数据库和pymongo,pymongo安装方法为命令行运行:pip install pymongo""" - mongodb_write = 1 - """值为0代表不将结果写入MySQL数据库,1代表写入;若要写入MySQL数据库, - 请先安装MySQL数据库和pymysql,pymysql安装方法为命令行运行:pip install pymysql""" - mysql_write = 1 - pic_download = 0 # 值为0代表不下载微博原始图片,1代表下载微博原始图片 - video_download = 0 # 值为0代表不下载微博视频,1代表下载微博视频 - - wb = Weibo(filter, since_date, mongodb_write, mysql_write, - pic_download, video_download) - - # 下面是自定义MySQL数据库连接配置(可选) - """因为操作MySQL数据库需要用户名、密码等参数,本程序默认为: - mysql_config = { - 'host': 'localhost', - 'port': 3306, - 'user': 'root', - 'password': '123456', - 'charset': 'utf8mb4' - } - 大家的参数配置如果和默认值不同,可以将上面的参数值替换成自己的, - 然后添加如下代码,使修改生效,如果你的参数和默认值相同则不需要下面的代码: - wb.change_mysql_config(mysql_config)""" - - # 下面是配置user_id_list - """user_id_list包含了要爬的目标微博id,可以是一个,也可以是多个,也可以从文件中读取 - 爬单个微博,user_id_list如下所示,可以改成任意合法的用户id - user_id_list = ['1669879400'] - 爬多个微博,user_id_list如下所示,可以改成任意合法的用户id - user_id_list = ['1223178222', '1669879400', '1729370543'] - 也可以在文件中读取user_id_list,文件中可以包含很多user_id, - 每个user_id占一行,也可以在user_id后面加注释,如用户昵称,user_id和注释之间必需要有空格, - 文件名任意,类型为txt,位置位于本程序的同目录下,文件内容可以为如下形式: - 1223178222 胡歌 - 1669879400 迪丽热巴 - 1729370543 郭碧婷 - 比如文件可以叫user_id_list.txt,读取文件中的user_id_list如下所示: - user_id_list = wb.get_user_list('user_id_list.txt')""" - user_id_list = ['1223178222', '1669879400', '1729370543'] - - wb.start(user_id_list) # 爬取微博信息 -``` -通过执行wb.start() 完成了微博的爬取工作。在上述代码执行后,我们可以得到很多信息:
+### 6.按需求修改脚本(可选) +本程序是一个Weibo类,用户可以按照自己的需求调用或修改。
+通过执行本程序,我们可以得到很多信息:
**wb.nickname**:用户昵称;
**wb.weibo_num**:微博数;
**wb.following**:关注数;
From 0fc271d6878513464877f7e4efd1cc3a91b43a15 Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Tue, 26 Nov 2019 21:03:14 +0800 Subject: [PATCH 099/363] Update README.md --- README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 9f9fcee9..7ae93e20 100644 --- a/README.md +++ b/README.md @@ -111,7 +111,7 @@ txt文件结果如下所示: >单文件版是所有代码都写到一个文件里,即[weiboSpider.py](https://github.com/dataabc/weiboSpider/blob/master/weiboSpider.py)。多文件版重构了单文件版,按照代码功能分成了几个文件,代码更清晰,更易读。如果你仅仅想使用程序,这两个版本用哪一个都一样;如果你不仅想使用,还想开发新功能,多文件版可能更容易。 多文件版由[songzy12](https://github.com/songzy12)重构。songzy12非常认真负责,对于我发现的问题都很耐心地修复了,而且效率非常高,在此感谢。
-本使用说明是单文件版的使用说明,后续会在[multi-file](https://github.com/dataabc/weiboSpider/tree/multi-file)分支的README文档中提供多文件版的使用说明。 +本使用说明是单文件版的使用说明。 ### 1.下载脚本 ```bash $ git clone https://github.com/dataabc/weibospider.git @@ -192,6 +192,7 @@ video_download控制是否下载微博中的视频,值为1代表下载,值 ``` "video_download": 1, ``` +代表下载微博中的视频。
**设置cookie**
请按照[如何获取cookie](#如何获取cookie),获取cookie,然后将“your cookie”替换成真实的cookie值。
**设置mysql_config(可选)**
@@ -200,7 +201,7 @@ mysql_config控制mysql参数配置。如果你不需要将结果信息写入mys ### 4.设置数据库(可选) 本部分是可选部分,如果不需要将爬取信息写入数据库,可跳过这一步。本程序目前支持MySQL数据库和MongoDB数据库,如果你需要写入其它数据库,可以参考这两个数据库的写法自己编写。
**MySQL数据库写入**
-要想将爬取信息写入MySQL,请据自己的系统环境安装MySQL,然后命令行执行: +要想将爬取信息写入MySQL,请根据自己的系统环境安装MySQL,然后命令行执行: ```bash $ pip install pymysql ``` From d029b89669760bb67cb25498a074fa68c9e4b899 Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Tue, 26 Nov 2019 21:07:29 +0800 Subject: [PATCH 100/363] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 76bf5eaf..0d886594 100644 --- a/README.md +++ b/README.md @@ -111,8 +111,8 @@ txt文件结果如下所示: >单文件版是所有代码都写到一个文件里,即[weiboSpider.py](https://github.com/dataabc/weiboSpider/blob/master/weiboSpider.py)。多文件版重构了单文件版,按照代码功能分成了几个文件,代码更清晰,更易读。如果你仅仅想使用程序,这两个版本用哪一个都一样;如果你不仅想使用,还想开发新功能,多文件版可能更容易。 多文件版由[songzy12](https://github.com/songzy12)重构。songzy12非常认真负责,对于我发现的问题都很耐心地修复了,而且效率非常高,在此感谢。
-本使用说明是多文件版的使用说明。 -**本版本包含文件及说明如下** +本使用说明是多文件版的使用说明。
+**本版本包含代码文件及说明如下** * config.json: 配置相关文件; * spider.py: 爬虫逻辑主文件; * printer.py: 打印调试信息相关; From a1868a241882749e9d74fc07054cf42d918f9603 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Cdataabc?= Date: Thu, 28 Nov 2019 02:33:46 +0800 Subject: [PATCH 101/363] =?UTF-8?q?fix:=20=E4=BF=AE=E5=A4=8D=E6=96=87?= =?UTF-8?q?=E4=BB=B6=E8=B7=AF=E5=BE=84bug?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- weiboSpider.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/weiboSpider.py b/weiboSpider.py index e8b63c5a..fd0d83cd 100644 --- a/weiboSpider.py +++ b/weiboSpider.py @@ -39,6 +39,9 @@ def __init__(self, config): self.mysql_config = config.get('mysql_config') # MySQL数据库连接配置,可以不填 user_id_list = config['user_id_list'] if not isinstance(user_id_list, list): + if not os.path.isabs(user_id_list): + user_id_list = os.path.split( + os.path.realpath(__file__))[0] + os.sep + user_id_list user_id_list = self.get_user_list(user_id_list) self.user_id_list = user_id_list # 要爬取的微博用户的user_id列表 self.user_id = '' # 用户id,如昵称为"Dear-迪丽热巴"的id为'1669879400' @@ -76,6 +79,9 @@ def validate_config(self, config): list)) and (not user_id_list.endswith('.txt')): sys.exit(u'user_id_list值应为list类型或txt文件路径') if not isinstance(user_id_list, list): + if not os.path.isabs(user_id_list): + user_id_list = os.path.split( + os.path.realpath(__file__))[0] + os.sep + user_id_list if not os.path.isfile(user_id_list): sys.exit( u'当前路径:%s 不存在user_id_list.txt文件' % @@ -936,10 +942,12 @@ def start(self): def main(): try: - if not os.path.isfile('./config.json'): + config_path = os.path.split( + os.path.realpath(__file__))[0] + os.sep + 'config.json' + if not os.path.isfile(config_path): sys.exit(u'当前路径:%s 不存在配置文件config.json' % (os.path.split(os.path.realpath(__file__))[0] + os.sep)) - with open('./config.json') as f: + with open(config_path) as f: config = json.loads(f.read()) wb = Weibo(config) wb.start() # 爬取微博信息 From 
938ab9572dc276e1a3576d1a3f241f0ff953eb54 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Cdataabc?= Date: Thu, 28 Nov 2019 02:43:00 +0800 Subject: [PATCH 102/363] =?UTF-8?q?fix:=20=E4=BF=AE=E5=A4=8D=E6=96=87?= =?UTF-8?q?=E4=BB=B6=E8=B7=AF=E5=BE=84bug?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- spider.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/spider.py b/spider.py index 401abf93..2e524c22 100644 --- a/spider.py +++ b/spider.py @@ -1,6 +1,7 @@ #!/usr/bin/env python # -*- coding: UTF-8 -*- +import os import random import sys from datetime import date, datetime, timedelta @@ -26,6 +27,11 @@ def __init__(self, config): for t in self.config['cookie'].split(";") } if type(self.config['user_id_list']) == type(u""): + user_id_list = self.config['user_id_list'] + if not os.path.isabs(user_id_list): + user_id_list = os.path.split( + os.path.realpath(__file__))[0] + os.sep + user_id_list + self.config['user_id_list'] = user_id_list with open(self.config['user_id_list'], 'rb') as f: lines = f.read().splitlines() lines = [line.decode('utf-8') for line in lines] @@ -149,7 +155,12 @@ def start(self): if __name__ == '__main__': import json - with open("./config.json") as f: + config_path = os.path.split( + os.path.realpath(__file__))[0] + os.sep + 'config.json' + if not os.path.isfile(config_path): + sys.exit(u'当前路径:%s 不存在配置文件config.json' % + (os.path.split(os.path.realpath(__file__))[0] + os.sep)) + with open(config_path) as f: config = json.loads(f.read()) spider = Spider(config) spider.start() # 爬取微博信息 From 61abe8c31bf79e093420587e5efa12d200625bf2 Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Thu, 28 Nov 2019 02:49:30 +0800 Subject: [PATCH 103/363] Update README.md --- README.md | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 7ae93e20..ac853002 100644 --- a/README.md +++ b/README.md @@ -243,7 +243,20 @@ $ python 
weibospider.py ``` 运行; ### 6.按需求修改脚本(可选) -本程序是一个Weibo类,用户可以按照自己的需求调用或修改。
+本部分为可选部分,如果你不需要自己修改代码或添加新功能,可以忽略此部分。
+本程序所有代码都位于weiboSpider.py文件,程序主体是一个Weibo类,上述所有功能都是通过在main函数调用Weibo类实现的,默认的调用代码如下: +```python + config_path = os.path.split( + os.path.realpath(__file__))[0] + os.sep + 'config.json' + if not os.path.isfile(config_path): + sys.exit(u'当前路径:%s 不存在配置文件config.json' % + (os.path.split(os.path.realpath(__file__))[0] + os.sep)) + with open(config_path) as f: + config = json.loads(f.read()) + wb = Weibo(config) + wb.start() # 爬取微博信息 +``` +用户可以按照自己的需求调用或修改Weibo类。
通过执行本程序,我们可以得到很多信息:
**wb.nickname**:用户昵称;
**wb.weibo_num**:微博数;
From 7220a82945a31030f696109163aa4c6a3fb909b3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Cdataabc?= Date: Sun, 1 Dec 2019 02:34:53 +0800 Subject: [PATCH 104/363] =?UTF-8?q?feat:=20=E6=B7=BB=E5=8A=A0=E5=AF=B9conf?= =?UTF-8?q?ig.json=E6=A0=BC=E5=BC=8F=E7=9A=84=E6=A3=80=E6=B5=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- weiboSpider.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/weiboSpider.py b/weiboSpider.py index fd0d83cd..aee631f6 100644 --- a/weiboSpider.py +++ b/weiboSpider.py @@ -951,6 +951,9 @@ def main(): config = json.loads(f.read()) wb = Weibo(config) wb.start() # 爬取微博信息 + except ValueError: + print(u'config.json 格式不正确,请参考 ' + u'https://github.com/dataabc/weiboSpider#3程序设置') except Exception as e: print('Error: ', e) traceback.print_exc() From 2ebd45c3671aac30851ebdd12e2426a4a20faed3 Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Mon, 16 Dec 2019 22:09:31 +0800 Subject: [PATCH 105/363] Update README.md --- README.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index ac853002..629e8b5c 100644 --- a/README.md +++ b/README.md @@ -20,8 +20,10 @@ - 写入**csv文件**(默认) - 写入**MySQL数据库**(可选) - 写入**MongoDB数据库**(可选) -- 下载用户微博中的原始**图片**(可选) -- 下载用户微博中的**视频**(可选)
+- 下载用户**原创**微博中的原始**图片**(可选) +- 下载用户**转发**微博中的原始**图片**([免cookie版](https://github.com/dataabc/weibo-crawler)特有) +- 下载用户**原创**微博中的**视频**(可选) +- 下载用户**转发**微博中的**视频**([免cookie版](https://github.com/dataabc/weibo-crawler)特有)
本程序需要设置用户cookie,以获取微博访问权限,后面会讲解如何获取cookie。如需免cookie版,大家可以访问,二者功能类似,免cookie版获取的信息更多,用法更简单,而且不需要cookie。
## 输出 From d9c6ac58acaaad5918be21076fa7096085312aa3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Cdataabc?= Date: Wed, 18 Dec 2019 19:53:05 +0800 Subject: [PATCH 106/363] =?UTF-8?q?fix:=20=E4=BF=AE=E5=A4=8D=E9=83=A8?= =?UTF-8?q?=E5=88=86=E6=96=87=E4=BB=B6=E8=B7=AF=E5=BE=84=E5=87=BA=E9=94=99?= =?UTF-8?q?=E6=97=B6=E4=B8=8D=E4=B8=A5=E8=B0=A8=E7=9A=84=E6=8F=90=E7=A4=BA?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- weiboSpider.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/weiboSpider.py b/weiboSpider.py index aee631f6..9b9701f2 100644 --- a/weiboSpider.py +++ b/weiboSpider.py @@ -70,7 +70,7 @@ def validate_config(self, config): sys.exit(u'write_mode值应为list类型') for mode in config['write_mode']: if mode not in write_mode: - sys.exit(u'%s为无效模式,请从txt、csv、mongo和mysql挑选一个或多个作为write_mode' % + sys.exit(u'%s为无效模式,请从txt、csv、mongo和mysql中挑选一个或多个作为write_mode' % mode) # 验证user_id_list @@ -83,9 +83,7 @@ def validate_config(self, config): user_id_list = os.path.split( os.path.realpath(__file__))[0] + os.sep + user_id_list if not os.path.isfile(user_id_list): - sys.exit( - u'当前路径:%s 不存在user_id_list.txt文件' % - (os.path.split(os.path.realpath(__file__))[0] + os.sep)) + sys.exit(u'不存在%s文件' % user_id_list) def is_date(self, since_date): """判断日期格式是否正确""" From a4e9eb721e7867312d4a7852e7756f030b56f201 Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Wed, 18 Dec 2019 20:16:45 +0800 Subject: [PATCH 107/363] Update README.md --- README.md | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 629e8b5c..6506cb91 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,8 @@ * [注意事项](#注意事项) ## 功能 
-连续爬取**一个**或**多个**新浪微博用户(如[胡歌](https://weibo.cn/u/1223178222)、[迪丽热巴](https://weibo.cn/u/1669879400)、[郭碧婷](https://weibo.cn/u/1729370543))的数据,并将结果信息写入**文件**或**数据库**。写入信息几乎包括了用户微博的所有数据,主要有**用户信息**和**微博信息**两大类,前者包含用户昵称、关注数、粉丝数、微博数等等;后者包含微博正文、发布时间、发布工具、评论数等等,因为内容太多,这里不再赘述,详细内容见[输出](#输出)部分。具体的写入文件类型如下: +连续爬取**一个**或**多个**新浪微博用户(如[胡歌](https://weibo.cn/u/1223178222)、[迪丽热巴](https://weibo.cn/u/1669879400)、[郭碧婷](https://weibo.cn/u/1729370543))的数据,并将结果信息写入**文件**或**数据库**。写入信息几乎包括了用户微博的所有数据,主要有**用户信息**和**微博信息**两大类,前者包含用户昵称、关注数、粉丝数、微博数等等;后者包含微博正文、发布时间、发布工具、评论数等等,因为内容太多,这里不再赘述,详细内容见[输出](#输出)部分。
+具体的写入文件类型如下: - 写入**txt文件**(默认) - 写入**csv文件**(默认) - 写入**MySQL数据库**(可选) @@ -25,7 +26,9 @@ - 下载用户**原创**微博中的**视频**(可选) - 下载用户**转发**微博中的**视频**([免cookie版](https://github.com/dataabc/weibo-crawler)特有)
-本程序需要设置用户cookie,以获取微博访问权限,后面会讲解如何获取cookie。如需免cookie版,大家可以访问,二者功能类似,免cookie版获取的信息更多,用法更简单,而且不需要cookie。
+当然,如果你只对用户信息感兴趣,而不需要爬用户的微博,也可以通过设置实现只爬取微博用户信息的功能。
+程序也可以实现**爬取结果自动更新**,即:现在爬取了目标用户的微博,几天之后,目标用户可能又发新微博了。通过设置,可以实现每隔几天增量爬取用户这几天发的新微博。
+本程序需要设置用户cookie,以获取微博访问权限,后面会讲解如何获取cookie。如需[免cookie版](https://github.com/dataabc/weibo-crawler),大家可以访问<https://github.com/dataabc/weibo-crawler>,二者功能类似,免cookie版获取的信息更多,用法更简单,而且不需要cookie。  
## 输出 本部分为爬取到的字段信息说明,为了与[免cookie版](https://github.com/dataabc/weibo-crawler)区分,下面将两者爬取到的信息都列出来。如果是免cookie版所特有的信息,会有免cookie标注,没有标注的为二者共有的信息。
**用户信息** From da32d32727b932d86f88d7b57b9e54e6491a9197 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Cdataabc?= Date: Sat, 21 Dec 2019 21:40:21 +0800 Subject: [PATCH 108/363] =?UTF-8?q?perf:=20=E4=BC=98=E5=8C=96=E8=BD=AC?= =?UTF-8?q?=E5=8F=91=E5=BE=AE=E5=8D=9A=E6=8F=90=E5=8F=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 主要是当被转发微博被删除或者设置为仅部分时间内可见时,在转发内容里给出转发微博提示 --- weiboSpider.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/weiboSpider.py b/weiboSpider.py index 9b9701f2..2418a062 100644 --- a/weiboSpider.py +++ b/weiboSpider.py @@ -252,12 +252,6 @@ def get_long_retweet(self, weibo_link): def get_retweet(self, info, weibo_id): """获取转发微博""" try: - original_user = info.xpath("div/span[@class='cmt']/a/text()") - if not original_user: - wb_content = u'转发微博已被删除' - return wb_content - else: - original_user = original_user[0] wb_content = self.deal_garbled(info) wb_content = wb_content[wb_content.find(':') + 1:wb_content.rfind(u'赞')] @@ -270,8 +264,13 @@ def get_retweet(self, info, weibo_id): wb_content = weibo_content retweet_reason = self.deal_garbled(info.xpath('div')[-1]) retweet_reason = retweet_reason[:retweet_reason.rindex(u'赞')] - wb_content = (retweet_reason + '\n' + u'原始用户: ' + original_user + - '\n' + u'转发内容: ' + wb_content) + original_user = info.xpath("div/span[@class='cmt']/a/text()") + if original_user: + original_user = original_user[0] + wb_content = (retweet_reason + '\n' + u'原始用户: ' + + original_user + '\n' + u'转发内容: ' + wb_content) + else: + wb_content = retweet_reason + '\n' + u'转发内容: ' + wb_content return wb_content except Exception as e: print('Error: ', e) From 06094b5e14e4574289553db04f0ee4f48b336e41 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Cdataabc?= Date: Sat, 21 Dec 2019 21:59:50 +0800 Subject: [PATCH 109/363] =?UTF-8?q?perf:=20=E4=BC=98=E5=8C=96=E8=BD=AC?= =?UTF-8?q?=E5=8F=91=E5=BE=AE=E5=8D=9A=E6=8F=90=E5=8F=96?= MIME-Version: 1.0 Content-Type: 
text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- html_parser.py | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/html_parser.py b/html_parser.py index 3412e4aa..40049f5b 100644 --- a/html_parser.py +++ b/html_parser.py @@ -1,12 +1,12 @@ # -*- coding: UTF-8 -*- +import re import sys -from lxml import etree import traceback -import re -from datetime import datetime, timedelta from collections import OrderedDict +from datetime import datetime, timedelta import requests +from lxml import etree class Parser: @@ -155,12 +155,6 @@ def get_long_retweet(self, weibo_link): def get_retweet(self, info, weibo_id): """获取转发微博""" - original_user = info.xpath("div/span[@class='cmt']/a/text()") - if not original_user: - wb_content = u'转发微博已被删除' - return wb_content - else: - original_user = original_user[0] wb_content = self.deal_garbled(info) wb_content = wb_content[wb_content.find(':') + 1:wb_content.rfind(u'赞')] @@ -173,8 +167,13 @@ def get_retweet(self, info, weibo_id): wb_content = weibo_content retweet_reason = self.deal_garbled(info.xpath('div')[-1]) retweet_reason = retweet_reason[:retweet_reason.rindex(u'赞')] - wb_content = (retweet_reason + '\n' + u'原始用户: ' + original_user + - '\n' + u'转发内容: ' + wb_content) + original_user = info.xpath("div/span[@class='cmt']/a/text()") + if original_user: + original_user = original_user[0] + wb_content = (retweet_reason + '\n' + u'原始用户: ' + original_user + + '\n' + u'转发内容: ' + wb_content) + else: + wb_content = retweet_reason + '\n' + u'转发内容: ' + wb_content return wb_content def is_original(self, info): @@ -206,7 +205,7 @@ def get_publish_place(self, info): if len(weibo_a) >= 1: publish_place = weibo_a[-1] if (u'视频' == div_first.xpath("span[@class='ctt']/a/text()") - [-1][-2:]): + [-1][-2:]): if len(weibo_a) >= 2: publish_place = weibo_a[-2] else: From 594e338386b52e31fabc2f7732244f27c06c548d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Cdataabc?= Date: Mon, 30 Dec 2019 23:27:57 
+0800 Subject: [PATCH 110/363] =?UTF-8?q?perf:=20=E4=BC=98=E5=8C=96=E5=BE=AE?= =?UTF-8?q?=E5=8D=9A=E8=A7=86=E9=A2=91=E4=B8=8B=E8=BD=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- weiboSpider.py | 79 ++++++++++++++++++++++++++------------------------ 1 file changed, 41 insertions(+), 38 deletions(-) diff --git a/weiboSpider.py b/weiboSpider.py index 2418a062..aa751110 100644 --- a/weiboSpider.py +++ b/weiboSpider.py @@ -93,7 +93,7 @@ def is_date(self, since_date): except ValueError: return False - def deal_html(self, url): + def handle_html(self, url): """处理html""" try: html = requests.get(url, cookies=self.cookie).content @@ -103,7 +103,7 @@ def deal_html(self, url): print('Error: ', e) traceback.print_exc() - def deal_garbled(self, info): + def handle_garbled(self, info): """处理乱码""" try: info = (info.xpath('string(.)').replace(u'\u200b', '').encode( @@ -117,7 +117,7 @@ def get_nickname(self): """获取用户昵称""" try: url = 'https://weibo.cn/%s/info' % (self.user_id) - selector = self.deal_html(url) + selector = self.handle_html(url) nickname = selector.xpath('//title/text()')[0] nickname = nickname[:-3] if nickname == u'登录 - 新' or nickname == u'新浪': @@ -211,9 +211,9 @@ def get_page_num(self, selector): def get_long_weibo(self, weibo_link): """获取长原创微博""" try: - selector = self.deal_html(weibo_link) + selector = self.handle_html(weibo_link) info = selector.xpath("//div[@class='c']")[1] - wb_content = self.deal_garbled(info) + wb_content = self.handle_garbled(info) wb_time = info.xpath("//span[@class='ct']/text()")[0] weibo_content = wb_content[wb_content.find(':') + 1:wb_content.rfind(wb_time)] @@ -226,7 +226,7 @@ def get_long_weibo(self, weibo_link): def get_original_weibo(self, info, weibo_id): """获取原创微博""" try: - weibo_content = self.deal_garbled(info) + weibo_content = self.handle_garbled(info) weibo_content = weibo_content[:weibo_content.rfind(u'赞')] a_text = info.xpath('div//a/text()') if u'全文' in a_text: @@ -252,7 
+252,7 @@ def get_long_retweet(self, weibo_link): def get_retweet(self, info, weibo_id): """获取转发微博""" try: - wb_content = self.deal_garbled(info) + wb_content = self.handle_garbled(info) wb_content = wb_content[wb_content.find(':') + 1:wb_content.rfind(u'赞')] wb_content = wb_content[:wb_content.rfind(u'赞')] @@ -262,7 +262,7 @@ def get_retweet(self, info, weibo_id): weibo_content = self.get_long_retweet(weibo_link) if weibo_content: wb_content = weibo_content - retweet_reason = self.deal_garbled(info.xpath('div')[-1]) + retweet_reason = self.handle_garbled(info.xpath('div')[-1]) retweet_reason = retweet_reason[:retweet_reason.rindex(u'赞')] original_user = info.xpath("div/span[@class='cmt']/a/text()") if original_user: @@ -315,7 +315,7 @@ def get_publish_place(self, info): publish_place = weibo_a[-2] else: publish_place = u'无' - publish_place = self.deal_garbled(publish_place) + publish_place = self.handle_garbled(publish_place) break return publish_place except Exception as e: @@ -326,7 +326,7 @@ def get_publish_time(self, info): """获取微博发布时间""" try: str_time = info.xpath("div/span[@class='ct']") - str_time = self.deal_garbled(str_time[0]) + str_time = self.handle_garbled(str_time[0]) publish_time = str_time.split(u'来自')[0] if u'刚刚' in publish_time: publish_time = datetime.now().strftime('%Y-%m-%d %H:%M') @@ -358,7 +358,7 @@ def get_publish_tool(self, info): """获取微博发布工具""" try: str_time = info.xpath("div/span[@class='ct']") - str_time = self.deal_garbled(str_time[0]) + str_time = self.handle_garbled(str_time[0]) if len(str_time.split(u'来自')) > 1: publish_tool = str_time.split(u'来自')[1] else: @@ -374,7 +374,7 @@ def get_weibo_footer(self, info): footer = {} pattern = r'\d+' str_footer = info.xpath('div')[-1] - str_footer = self.deal_garbled(str_footer) + str_footer = self.handle_garbled(str_footer) str_footer = str_footer[str_footer.rfind(u'赞'):] weibo_footer = re.findall(pattern, str_footer, re.M) @@ -399,7 +399,7 @@ def extract_picture_urls(self, info, weibo_id): 
all_pic = 'https://weibo.cn/mblog/picAll/' + weibo_id + '?rl=1' if first_pic in a_list: if all_pic in a_list: - selector = self.deal_html(all_pic) + selector = self.handle_html(all_pic) preview_picture_list = selector.xpath('//img/@src') picture_list = [ p.replace('/thumb180/', '/large/') @@ -500,39 +500,42 @@ def download_one_file(self, url, file_path, type, weibo_id): print('Error: ', e) traceback.print_exc() - def download_files(self, type): + def handle_download(self, file_type, file_dir, urls, w): + """处理下载相关操作""" + file_prefix = w['publish_time'][:11].replace('-', '') + '_' + w['id'] + if file_type == 'img': + if ',' in urls: + url_list = urls.split(',') + for i, url in enumerate(url_list): + file_suffix = url[url.rfind('.'):] + file_name = file_prefix + '_' + str(i + 1) + file_suffix + file_path = file_dir + os.sep + file_name + self.download_one_file(url, file_path, file_type, w['id']) + else: + file_suffix = urls[urls.rfind('.'):] + file_name = file_prefix + file_suffix + file_path = file_dir + os.sep + file_name + self.download_one_file(urls, file_path, file_type, w['id']) + else: + file_suffix = '.mp4' + file_name = file_prefix + file_suffix + file_path = file_dir + os.sep + file_name + self.download_one_file(urls, file_path, file_type, w['id']) + + def download_files(self, file_type): """下载文件(图片/视频)""" try: - if type == 'img': + if file_type == 'img': describe = u'图片' key = 'original_pictures' else: describe = u'视频' key = 'video_url' print(u'即将进行%s下载' % describe) - file_dir = self.get_filepath(type) + file_dir = self.get_filepath(file_type) for w in tqdm(self.weibo, desc='Download progress'): if w[key] != u'无': - file_prefix = w['publish_time'][:11].replace( - '-', '') + '_' + w['id'] - if type == 'img' and ',' in w[key]: - w[key] = w[key].split(',') - for j, url in enumerate(w[key]): - file_suffix = url[url.rfind('.'):] - file_name = file_prefix + '_' + str( - j + 1) + file_suffix - file_path = file_dir + os.sep + file_name - 
self.download_one_file(url, file_path, type, - w['id']) - else: - if type == 'video': - file_suffix = '.mp4' - else: - file_suffix = w[key][w[key].rfind('.'):] - file_name = file_prefix + file_suffix - file_path = file_dir + os.sep + file_name - self.download_one_file(w[key], file_path, type, - w['id']) + self.handle_download(file_type, file_dir, w[key], w) print(u'%s下载完毕,保存路径:' % describe) print(file_dir) except Exception as e: @@ -593,7 +596,7 @@ def get_one_page(self, page): """获取第page页的全部微博""" try: url = 'https://weibo.cn/u/%s?page=%d' % (self.user_id, page) - selector = self.deal_html(url) + selector = self.handle_html(url) info = selector.xpath("//div[@class='c']") is_exist = info[0].xpath("div/span[@class='ctt']") if is_exist: @@ -868,7 +871,7 @@ def get_weibo_info(self): """获取微博信息""" try: url = 'https://weibo.cn/u/%s' % (self.user_id) - selector = self.deal_html(url) + selector = self.handle_html(url) self.get_user_info(selector) # 获取用户昵称、微博数、关注数、粉丝数 page_num = self.get_page_num(selector) # 获取微博总页数 wrote_num = 0 From d114b75e83c471e73c22d794daafc4fbffaed84c Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Mon, 30 Dec 2019 23:56:22 +0800 Subject: [PATCH 111/363] Update README.md --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 6506cb91..49872331 100644 --- a/README.md +++ b/README.md @@ -24,7 +24,9 @@ - 下载用户**原创**微博中的原始**图片**(可选) - 下载用户**转发**微博中的原始**图片**([免cookie版](https://github.com/dataabc/weibo-crawler)特有) - 下载用户**原创**微博中的**视频**(可选) -- 下载用户**转发**微博中的**视频**([免cookie版](https://github.com/dataabc/weibo-crawler)特有)
+- 下载用户**转发**微博中的**视频**([免cookie版](https://github.com/dataabc/weibo-crawler)特有) +- 下载用户**原创**微博**live photo**中的**视频**([免cookie版](https://github.com/dataabc/weibo-crawler)特有) +- 下载用户**转发**微博**live photo**中的**视频**([免cookie版](https://github.com/dataabc/weibo-crawler)特有)
当然,如果你只对用户信息感兴趣,而不需要爬用户的微博,也可以通过设置实现只爬取微博用户信息的功能。
程序也可以实现**爬取结果自动更新**,即:现在爬取了目标用户的微博,几天之后,目标用户可能又发新微博了。通过设置,可以实现每隔几天增量爬取用户这几天发的新微博。
From 502ccf2be5d10d9fe355d0b14fe88aa8ae7fa227 Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Tue, 31 Dec 2019 00:04:37 +0800 Subject: [PATCH 112/363] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 49872331..4a185396 100644 --- a/README.md +++ b/README.md @@ -25,8 +25,8 @@ - 下载用户**转发**微博中的原始**图片**([免cookie版](https://github.com/dataabc/weibo-crawler)特有) - 下载用户**原创**微博中的**视频**(可选) - 下载用户**转发**微博中的**视频**([免cookie版](https://github.com/dataabc/weibo-crawler)特有) -- 下载用户**原创**微博**live photo**中的**视频**([免cookie版](https://github.com/dataabc/weibo-crawler)特有) -- 下载用户**转发**微博**live photo**中的**视频**([免cookie版](https://github.com/dataabc/weibo-crawler)特有)
+- 下载用户**原创**微博**Live Photo**中的**视频**([免cookie版](https://github.com/dataabc/weibo-crawler)特有) +- 下载用户**转发**微博**Live Photo**中的**视频**([免cookie版](https://github.com/dataabc/weibo-crawler)特有)
当然,如果你只对用户信息感兴趣,而不需要爬用户的微博,也可以通过设置实现只爬取微博用户信息的功能。
程序也可以实现**爬取结果自动更新**,即:现在爬取了目标用户的微博,几天之后,目标用户可能又发新微博了。通过设置,可以实现每隔几天增量爬取用户这几天发的新微博。
From 8b2f61ad46b8c9c0f4731c2bf940809f4f95624a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Cdataabc?= Date: Wed, 8 Jan 2020 19:24:56 +0800 Subject: [PATCH 113/363] =?UTF-8?q?fix:=20=E4=BF=AE=E5=A4=8D=E5=86=99?= =?UTF-8?q?=E5=85=A5mongo=E4=BC=9A=E6=94=B9=E5=8F=98=E5=8E=9F=E5=A7=8B?= =?UTF-8?q?=E6=95=B0=E6=8D=AE=E7=9A=84bug?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- weiboSpider.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/weiboSpider.py b/weiboSpider.py index aa751110..81df36e3 100644 --- a/weiboSpider.py +++ b/weiboSpider.py @@ -2,6 +2,7 @@ # -*- coding: UTF-8 -*- import codecs +import copy import csv import json import os @@ -738,7 +739,11 @@ def info_to_mongodb(self, collection, info_list): client = MongoClient() db = client['weibo'] collection = db[collection] - for info in info_list: + if len(self.write_mode) > 1: + new_info_list = copy.deepcopy(info_list) + else: + new_info_list = info_list + for info in new_info_list: if not collection.find_one({'id': info['id']}): collection.insert_one(info) else: From bbac5eccfcc022969a80bb31bde026ce91431edd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Cdataabc?= Date: Wed, 8 Jan 2020 19:28:13 +0800 Subject: [PATCH 114/363] =?UTF-8?q?perf:=20=E4=BC=98=E5=8C=96csv=E5=86=99?= =?UTF-8?q?=E5=85=A5=E6=95=88=E7=8E=87?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- weiboSpider.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/weiboSpider.py b/weiboSpider.py index 81df36e3..8f76b507 100644 --- a/weiboSpider.py +++ b/weiboSpider.py @@ -671,7 +671,7 @@ def write_csv(self, wrote_num): if not self.filter: result_headers.insert(3, '被转发微博原始图片url') result_headers.insert(4, '是否为原创微博') - result_data = [w.values() for w in self.weibo][wrote_num:] + result_data = [w.values() for w in self.weibo[wrote_num:]] if sys.version < '3': # python2.x reload(sys) sys.setdefaultencoding('utf-8') From 
708aeab201be8065de4c4970cb26bd80bc98d9f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Cdataabc?= Date: Wed, 8 Jan 2020 20:23:54 +0800 Subject: [PATCH 115/363] =?UTF-8?q?fix:=20=E4=BF=AE=E5=A4=8D=E5=86=99?= =?UTF-8?q?=E5=85=A5mongo=E5=92=8Cmysql=E6=97=B6=E6=94=B9=E5=8F=98?= =?UTF-8?q?=E5=8E=9F=E5=A7=8B=E6=95=B0=E6=8D=AE=E7=9A=84bug?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- writer.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/writer.py b/writer.py index b688b5bf..c028816c 100644 --- a/writer.py +++ b/writer.py @@ -1,4 +1,6 @@ # -*- coding: UTF-8 -*- + +import copy import csv import os import sys @@ -47,11 +49,18 @@ def __init__(self, config): def write_user(self, user): for writer in self.writers: - writer.write_user(user) + if isinstance(writer, MongoWriter): + writer.write_user(copy.deepcopy(user)) + else: + writer.write_user(user) def write_weibo(self, weibo): for writer in self.writers: - writer.write_weibo(weibo) + if isinstance(writer, MongoWriter) or isinstance( + writer, MysqlWriter): + writer.write_weibo(copy.deepcopy(weibo)) + else: + writer.write_weibo(weibo) class TxtWriter: From da68380d76e78e3a4aff1ee97dce35b6270c8d7e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Cdataabc?= Date: Wed, 8 Jan 2020 20:29:48 +0800 Subject: [PATCH 116/363] =?UTF-8?q?fix:=20=E4=BF=AE=E5=A4=8D=E5=86=99?= =?UTF-8?q?=E5=85=A5mysql=E5=BE=AE=E5=8D=9A=E4=BF=A1=E6=81=AF=E6=97=B6?= =?UTF-8?q?=E4=BC=9A=E4=BF=AE=E6=94=B9=E5=8E=9F=E5=A7=8B=E6=95=B0=E6=8D=AE?= =?UTF-8?q?=E7=9A=84bug?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- weiboSpider.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/weiboSpider.py b/weiboSpider.py index 8f76b507..f11c3a28 100644 --- a/weiboSpider.py +++ b/weiboSpider.py @@ -854,7 +854,11 @@ def weibo_to_mysql(self, wrote_num): self.mysql_create_table(mysql_config, create_table) # 
在'weibo'表中插入或更新微博数据 weibo_list = [] - for weibo in self.weibo[wrote_num:]: + if len(self.write_mode) > 1: + info_list = copy.deepcopy(self.weibo[wrote_num:]) + else: + info_list = self.weibo[wrote_num:] + for weibo in info_list: weibo['user_id'] = self.user_id weibo_list.append(weibo) self.mysql_insert(mysql_config, 'weibo', weibo_list) From b17de85451fb1874b308286140d9ce18b15b7603 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Cdataabc?= Date: Thu, 9 Jan 2020 18:24:35 +0800 Subject: [PATCH 117/363] =?UTF-8?q?feat:=20=E6=B7=BB=E5=8A=A0=E5=B0=86?= =?UTF-8?q?=E7=BB=93=E6=9E=9C=E4=BF=A1=E6=81=AF=E5=86=99=E5=85=A5json?= =?UTF-8?q?=E6=96=87=E4=BB=B6=E5=8A=9F=E8=83=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- weiboSpider.py | 50 ++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 46 insertions(+), 4 deletions(-) diff --git a/weiboSpider.py b/weiboSpider.py index f11c3a28..f31fbe4a 100644 --- a/weiboSpider.py +++ b/weiboSpider.py @@ -31,7 +31,7 @@ def __init__(self, config): since_date = str(date.today() - timedelta(int(since_date))) self.since_date = since_date # 起始时间,即爬取发布日期从该值到现在的微博,形式为yyyy-mm-dd self.write_mode = config[ - 'write_mode'] # 结果信息保存类型,为list形式,可包含txt、csv、mongo和mysql四种类型 + 'write_mode'] # 结果信息保存类型,为list形式,可包含txt、csv、json、mongo和mysql五种类型 self.pic_download = config[ 'pic_download'] # 取值范围为0、1,程序默认值为0,代表不下载微博原始图片,1代表下载 self.video_download = config[ @@ -66,13 +66,14 @@ def validate_config(self, config): sys.exit(u'since_date值应为yyyy-mm-dd形式或整数,请重新输入') # 验证write_mode - write_mode = ['txt', 'csv', 'mongo', 'mysql'] + write_mode = ['txt', 'csv', 'json', 'mongo', 'mysql'] if not isinstance(config['write_mode'], list): sys.exit(u'write_mode值应为list类型') for mode in config['write_mode']: if mode not in write_mode: - sys.exit(u'%s为无效模式,请从txt、csv、mongo和mysql中挑选一个或多个作为write_mode' % - mode) + sys.exit( + u'%s为无效模式,请从txt、csv、json、mongo和mysql中挑选一个或多个作为write_mode' % + mode) # 验证user_id_list user_id_list = 
config['user_id_list'] @@ -728,6 +729,45 @@ def write_txt(self, wrote_num): print('Error: ', e) traceback.print_exc() + def update_json_data(self, data, weibo_info): + """更新要写入json结果文件中的数据,已经存在于json中的信息更新为最新值,不存在的信息添加到data中""" + data['user'] = self.user + if data.get('weibo'): + is_new = 1 # 待写入微博是否全部为新微博,即待写入微博与json中的数据不重复 + for old in data['weibo']: + if weibo_info[-1]['id'] == old['id']: + is_new = 0 + break + if is_new == 0: + for new in weibo_info: + flag = 1 + for i, old in enumerate(data['weibo']): + if new['id'] == old['id']: + data['weibo'][i] = new + flag = 0 + break + if flag: + data['weibo'].append(new) + else: + data['weibo'] += weibo_info + else: + data['weibo'] = weibo_info + return data + + def write_json(self, wrote_num): + """将爬到的信息写入json文件""" + data = {} + path = self.get_filepath('json') + if os.path.isfile(path): + with codecs.open(path, 'r', encoding="utf-8") as f: + data = json.load(f) + weibo_info = self.weibo[wrote_num:] + data = self.update_json_data(data, weibo_info) + with codecs.open(path, 'w', encoding="utf-8") as f: + json.dump(data, f, ensure_ascii=False) + print(u'%d条微博写入json文件完毕,保存路径:' % self.got_num) + print(path) + def info_to_mongodb(self, collection, info_list): """将爬取的信息写入MongoDB数据库""" try: @@ -871,6 +911,8 @@ def write_data(self, wrote_num): self.write_csv(wrote_num) if 'txt' in self.write_mode: self.write_txt(wrote_num) + if 'json' in self.write_mode: + self.write_json(wrote_num) if 'mysql' in self.write_mode: self.weibo_to_mysql(wrote_num) if 'mongo' in self.write_mode: From a1be97350214902eaa08543c208c070392f2230e Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Thu, 9 Jan 2020 18:43:35 +0800 Subject: [PATCH 118/363] Update README.md --- README.md | 58 +++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 50 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 4a185396..acb56ffb 100644 --- a/README.md +++ b/README.md @@ -19,6 +19,7 @@ 具体的写入文件类型如下: - 写入**txt文件**(默认) - 写入**csv文件**(默认) +- 
写入**json文件**(可选) - 写入**MySQL数据库**(可选) - 写入**MongoDB数据库**(可选) - 下载用户**原创**微博中的原始**图片**(可选) @@ -77,7 +78,7 @@ "user_id_list": ["1669879400"], "filter": 1, "since_date": "1900-01-01", - "write_mode": ["csv", "txt"], + "write_mode": ["csv", "txt", "json"], "pic_download": 1, "video_download": 1, "cookie": "your cookie" @@ -85,25 +86,66 @@ ``` 对于上述参数的含义以及取值范围,这里仅作简单介绍,详细信息见[程序设置](#3程序设置)。 ->**user_id_list**代表我们要爬取的微博用户的user_id,可以是一个或多个,也可以是文件路径,微博用户Dear-迪丽热巴的user_id为1669879400,具体如何获取user_id见[如何获取user_id](#如何获取user_id);
**filter**的值为1代表爬取全部原创微博,值为0代表爬取全部微博(原创+转发);
**since_date**代表我们要爬取since_date日期之后发布的微博,因为我要爬迪丽热巴的全部原创微博,所以since_date设置了一个非常早的值;
**write_mode**代表结果文件的保存类型,我想要把结果写入txt文件和csv文件,所以它的值为["csv", "txt"],如果你想写入数据库,具体设置见[设置数据库](#4设置数据库可选);
**pic_download**值为1代表下载微博中的图片,值为0代表不下载;
**video_download**值为1代表下载微博中的视频,值为0代表不下载;
**cookie**是爬虫微博的cookie,具体如何获取cookie见[如何获取cookie](#如何获取cookie),获取cookie后把"your cookie"替换成真实的cookie值即可。
+>**user_id_list**代表我们要爬取的微博用户的user_id,可以是一个或多个,也可以是文件路径,微博用户Dear-迪丽热巴的user_id为1669879400,具体如何获取user_id见[如何获取user_id](#如何获取user_id);
**filter**的值为1代表爬取全部原创微博,值为0代表爬取全部微博(原创+转发);
**since_date**代表我们要爬取since_date日期之后发布的微博,因为我要爬迪丽热巴的全部原创微博,所以since_date设置了一个非常早的值;
**write_mode**代表结果文件的保存类型,我想要把结果写入txt文件、csv文件和json文件,所以它的值为["csv", "txt", "json"],如果你想写入数据库,具体设置见[设置数据库](#4设置数据库可选);
**pic_download**值为1代表下载微博中的图片,值为0代表不下载;
**video_download**值为1代表下载微博中的视频,值为0代表不下载;
**cookie**是爬取微博时使用的cookie,具体如何获取cookie见[如何获取cookie](#如何获取cookie),获取cookie后把"your cookie"替换成真实的cookie值即可。
cookie修改完成后运行程序: ```bash $ python weiboSpider.py ``` -程序会自动生成一个weibo文件夹,我们以后爬取的所有微博都被存储在这里。然后程序在该文件夹下生成一个名为"Dear-迪丽热巴"的文件夹,迪丽热巴的所有微博爬取结果都在这里。"Dear-迪丽热巴"文件夹里包含一个csv文件、一个txt文件、一个img文件夹和一个video文件夹,img文件夹用来存储下载到的图片,video文件夹用来存储下载到的视频。如果你设置了保存数据库功能,这些信息也会保存在数据库里,数据库设置见[设置数据库](#4设置数据库可选)部分。
+程序会自动生成一个weibo文件夹,我们以后爬取的所有微博都被存储在这里。然后程序在该文件夹下生成一个名为"Dear-迪丽热巴"的文件夹,迪丽热巴的所有微博爬取结果都在这里。"Dear-迪丽热巴"文件夹里包含一个csv文件、一个txt文件、一个json文件、一个img文件夹和一个video文件夹,img文件夹用来存储下载到的图片,video文件夹用来存储下载到的视频。如果你设置了保存数据库功能,这些信息也会保存在数据库里,数据库设置见[设置数据库](#4设置数据库可选)部分。

-csv文件结果如下所示: +**csv结果文件如下所示:** ![](https://picture.cognize.me/cognize/github/weibospider/weibo_csv.png)*1669879400.csv*

-txt文件结果如下所示: +**txt结果文件如下所示:** ![](https://picture.cognize.me/cognize/github/weibospider/weibo_txt.png)*1669879400.txt*

-下载的图片如下所示: +json文件包含迪丽热巴的用户信息和上千条微博信息,内容较多。为了表达清晰,这里仅展示两条微博。
+**json结果文件如下所示:** +``` +{ + "user": { + "nickname": "Dear-迪丽热巴", + "weibo_num": 1086, + "following": 248, + "followers": 65594012, + "id": "1669879400" + }, + "weibo": [ + { + "id": "IonM9ryMy", + "content": "2019#微博之夜#盛典即将开启,以微博之力,让世界更美。1月11日,不见不散@微博之夜  原图 ", + "original_pictures": "http://wx1.sinaimg.cn/large/63885668ly1gao0a01kfzj20ku112k98.jpg", + "video_url": "无", + "publish_place": "无", + "publish_time": "2020-01-07 14:59", + "publish_tool": "无", + "up_num": 239242, + "retweet_num": 71914, + "comment_num": 55916 + }, + { + "id": "InB4Df73X", + "content": "#happyNEOyear#都到了2020,还不换点新pose配新装[來] 穿上@adidasneo 迪士尼联名款,让#生来好动#的我们一起玩“新”大发、自拍不重样🤳http://t.cn/AiF7nREj adidasneo的微博视频  ", + "original_pictures": "无", + "video_url": "http://f.video.weibocdn.com/000pYrGmlx07zPTskBQQ010412008AOY0E010.mp4?label=mp4_hd&template=852x480.25.0&trans_finger=62b30a3f061b162e421008955c73f536&Expires=1578569162&ssig=IV3JEbh3Zu&KID=unistore,video", + "publish_place": "无", + "publish_time": "2020-01-02 11:00", + "publish_tool": "无", + "up_num": 275419, + "retweet_num": 376734, + "comment_num": 131069 + } + ] +} +``` +*1669879400.json*
+
+**下载的图片如下所示:** ![](https://picture.cognize.me/cognize/github/weibospider/img.png)*img文件夹*
本次下载了793张图片,大小一共1.21GB,包括她原创微博中的图片和转发微博转发理由中的图片。图片名为yyyymmdd+微博id的形式,若某条微博存在多张图片,则图片名中还会包括它在微博图片中的序号。若某张图片因为网络等原因下载失败,程序则会以“weibo_id:pic_url”的形式将出错微博id和图片url写入同文件夹下的not_downloaded.txt里。

-下载的视频如下所示: +**下载的视频如下所示:** ![](https://picture.cognize.me/cognize/github/weibospider/video.png)*video文件夹*
本次下载了70个视频,是她原创微博中的视频,视频名为yyyymmdd+微博id的形式。其中有一个视频因为网络原因下载失败,程序将它的微博id和视频url以“weibo_id:video_url”的形式写到了同文件夹下的not_downloaded.txt里。
因为我本地没有安装MySQL数据库和MongoDB数据库,所以暂时设置成不写入数据库。如果你想要将爬取结果写入数据库,只需要先安装数据库(MySQL或MongoDB),再安装对应包(pymysql或pymongo),然后将mysql_write或mongodb_write值设置为1即可。写入MySQL需要用户名、密码等配置信息,这些配置如何设置见[设置数据库](#4设置数据库可选)部分。 @@ -183,7 +225,7 @@ since_date值可以是日期,也可以是整数。如果是日期,代表爬 ``` 代表爬取最近10天的微博,这个说法不是特别准确,准确说是爬取发布时间从**10天前到本程序开始执行时**之间的微博。
**设置write_mode**
-write_mode控制结果文件格式,取值范围是csv、txt、mongo和mysql,分别代表将结果文件写入csv、txt、MongoDB和MySQL数据库。write_mode可以同时包含这些取值中的一个或几个,如: +write_mode控制结果文件格式,取值范围是csv、txt、json、mongo和mysql,分别代表将结果文件写入csv、txt、json、MongoDB和MySQL数据库。write_mode可以同时包含这些取值中的一个或几个,如: ``` "write_mode": ["csv", "txt"], ``` From cfaf83bb0d8deeda07f5300b74f4b82e992cd0f4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Cdataabc?= Date: Mon, 13 Jan 2020 00:50:06 +0800 Subject: [PATCH 119/363] =?UTF-8?q?perf:=20=E4=BC=98=E5=8C=96=E7=94=A8?= =?UTF-8?q?=E6=88=B7id=E5=92=8C=E7=88=AC=E5=8F=96=E8=B5=B7=E5=A7=8B?= =?UTF-8?q?=E6=97=B6=E5=80=99=E8=AF=BB=E5=8F=96=E6=96=B9=E5=BC=8F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 当config.json文件中的user_id_list为文件路径时,程序会读取user_id_list文件中的user_id和since_date,如果文件中没有since_date,程序将使用config.json文件中的since_date值。一个用户爬取完成后,会将爬取的用户id、用户昵称和最新一条微博的发布时间写入user_id_list文件。这样的好处是使用者可以不用关心since_date,每次爬取都是重上一次结束的日期开始;可以直接在user_id_list文件中设置since_date,每个用户可以设置不同的since_date,提高了程序的灵活性 Issue #103 --- weiboSpider.py | 93 ++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 67 insertions(+), 26 deletions(-) diff --git a/weiboSpider.py b/weiboSpider.py index f31fbe4a..dd919708 100644 --- a/weiboSpider.py +++ b/weiboSpider.py @@ -43,9 +43,16 @@ def __init__(self, config): if not os.path.isabs(user_id_list): user_id_list = os.path.split( os.path.realpath(__file__))[0] + os.sep + user_id_list - user_id_list = self.get_user_list(user_id_list) - self.user_id_list = user_id_list # 要爬取的微博用户的user_id列表 - self.user_id = '' # 用户id,如昵称为"Dear-迪丽热巴"的id为'1669879400' + self.user_config_file_path = user_id_list # 用户配置文件路径 + user_config_list = self.get_user_config_list(user_id_list) + else: + self.user_config_file_path = '' + user_config_list = [{ + 'user_id': user_id, + 'since_date': self.since_date + } for user_id in user_id_list] + self.user_config_list = user_config_list # 要爬取的微博用户的user_config列表 + self.user_config = {} # 用户配置,包含用户id和since_date self.user = {} # 
存储爬取到的用户信息 self.got_num = 0 # 存储爬取到的微博数 self.weibo = [] # 存储爬取到的所有微博信息 @@ -118,7 +125,7 @@ def handle_garbled(self, info): def get_nickname(self): """获取用户昵称""" try: - url = 'https://weibo.cn/%s/info' % (self.user_id) + url = 'https://weibo.cn/%s/info' % (self.user_config['user_id']) selector = self.handle_html(url) nickname = selector.xpath('//title/text()')[0] nickname = nickname[:-3] @@ -189,7 +196,7 @@ def get_user_info(self, selector): self.user['weibo_num'] = weibo_num self.user['following'] = following self.user['followers'] = followers - self.user['id'] = self.user_id + self.user['id'] = self.user_config['user_id'] self.print_user_info() self.user_to_database() print('*' * 100) @@ -597,7 +604,8 @@ def is_pinned_weibo(self, info): def get_one_page(self, page): """获取第page页的全部微博""" try: - url = 'https://weibo.cn/u/%s?page=%d' % (self.user_id, page) + url = 'https://weibo.cn/u/%s?page=%d' % ( + self.user_config['user_id'], page) selector = self.handle_html(url) info = selector.xpath("//div[@class='c']") is_exist = info[0].xpath("div/span[@class='ctt']") @@ -609,8 +617,8 @@ def get_one_page(self, page): continue publish_time = datetime.strptime( weibo['publish_time'][:10], "%Y-%m-%d") - since_date = datetime.strptime(self.since_date, - "%Y-%m-%d") + since_date = datetime.strptime( + self.user_config['since_date'], "%Y-%m-%d") if publish_time < since_date: if self.is_pinned_weibo(info[i]): continue @@ -637,7 +645,8 @@ def get_filepath(self, type): os.makedirs(file_dir) if type == 'img' or type == 'video': return file_dir - file_path = file_dir + os.sep + self.user_id + '.' + type + file_path = file_dir + os.sep + self.user_config[ + 'user_id'] + '.' 
+ type return file_path except Exception as e: print('Error: ', e) @@ -707,10 +716,12 @@ def write_txt(self, wrote_num): else: result_header = u'\n\n微博内容: \n' result_header = (u'用户信息\n用户昵称:' + self.user['nickname'] + - u'\n用户id: ' + str(self.user_id) + u'\n微博数: ' + - str(self.user['weibo_num']) + u'\n关注数: ' + - str(self.user['following']) + u'\n粉丝数: ' + - str(self.user['followers']) + result_header) + u'\n用户id: ' + + str(self.user_config['user_id']) + + u'\n微博数: ' + str(self.user['weibo_num']) + + u'\n关注数: ' + str(self.user['following']) + + u'\n粉丝数: ' + str(self.user['followers']) + + result_header) temp_result.append(result_header) for i, w in enumerate(self.weibo[wrote_num:]): temp_result.append( @@ -795,7 +806,7 @@ def weibo_to_mongodb(self, wrote_num): """将爬取的微博信息写入MongoDB数据库""" weibo_list = [] for w in self.weibo[wrote_num:]: - w['user_id'] = self.user_id + w['user_id'] = self.user_config['user_id'] weibo_list.append(w) self.info_to_mongodb('weibo', weibo_list) print(u'%d条微博写入MongoDB数据库完毕' % self.got_num) @@ -899,11 +910,32 @@ def weibo_to_mysql(self, wrote_num): else: info_list = self.weibo[wrote_num:] for weibo in info_list: - weibo['user_id'] = self.user_id + weibo['user_id'] = self.user_config['user_id'] weibo_list.append(weibo) self.mysql_insert(mysql_config, 'weibo', weibo_list) print(u'%d条微博写入MySQL数据库完毕' % self.got_num) + def update_user_config_file(self, user_config_file_path): + """更新用户配置文件""" + with open(user_config_file_path, 'rb') as f: + lines = f.read().splitlines() + lines = [line.decode('utf-8') for line in lines] + for i, line in enumerate(lines): + info = line.split(' ') + if len(info) > 0 and info[0].isdigit(): + if self.user_config['user_id'] == info[0]: + if len(info) == 1: + info.append(self.user['nickname']) + info.append(self.weibo[0]['publish_time'][:10]) + if len(info) == 2: + info.append(self.weibo[0]['publish_time'][:10]) + if len(info) > 2: + info[2] = self.weibo[0]['publish_time'][:10] + lines[i] = ' '.join(info) + break + with 
codecs.open(user_config_file_path, 'w', encoding="utf-8") as f: + f.write('\n'.join(lines)) + def write_data(self, wrote_num): """将爬取到的信息写入文件或数据库""" if self.got_num > wrote_num: @@ -921,7 +953,7 @@ def write_data(self, wrote_num): def get_weibo_info(self): """获取微博信息""" try: - url = 'https://weibo.cn/u/%s' % (self.user_id) + url = 'https://weibo.cn/u/%s' % (self.user_config['user_id']) selector = self.handle_html(url) self.get_user_info(selector) # 获取用户昵称、微博数、关注数、粉丝数 page_num = self.get_page_num(selector) # 获取微博总页数 @@ -954,34 +986,43 @@ def get_weibo_info(self): print('Error: ', e) traceback.print_exc() - def get_user_list(self, file_name): + def get_user_config_list(self, file_name): """获取文件中的微博id信息""" with open(file_name, 'rb') as f: lines = f.read().splitlines() lines = [line.decode('utf-8') for line in lines] - user_id_list = [ - line.split(' ')[0] for line in lines - if len(line.split(' ')) > 0 and line.split(' ')[0].isdigit() - ] - return user_id_list + user_config_list = [] + for line in lines: + info = line.split(' ') + if len(info) > 0 and info[0].isdigit(): + user_config = {} + user_config['user_id'] = info[0] + if len(info) > 2 and self.is_date(info[2]): + user_config['since_date'] = info[2] + else: + user_config['since_date'] = self.since_date + user_config_list.append(user_config) + return user_config_list - def initialize_info(self, user_id): + def initialize_info(self, user_config): """初始化爬虫信息""" self.got_num = 0 self.weibo = [] self.user = {} - self.user_id = user_id + self.user_config = user_config self.weibo_id_list = [] def start(self): """运行爬虫""" try: - for user_id in self.user_id_list: - self.initialize_info(user_id) + for user_config in self.user_config_list: + self.initialize_info(user_config) print('*' * 100) self.get_weibo_info() print(u'信息抓取完毕') print('*' * 100) + if self.user_config_file_path: + self.update_user_config_file(self.user_config_file_path) if self.pic_download == 1: self.download_files('img') if self.video_download == 1: From 
7c7862e2cbe1182d2f1799a58ee7645fa58b6fb2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Cdataabc?= Date: Mon, 13 Jan 2020 01:01:44 +0800 Subject: [PATCH 120/363] =?UTF-8?q?perf:=20=E4=BC=98=E5=8C=96=E9=83=A8?= =?UTF-8?q?=E5=88=86=E7=BB=93=E6=9E=9C=E6=96=87=E4=BB=B6=E5=86=85=E7=9A=84?= =?UTF-8?q?=E6=97=B6=E9=97=B4=E6=95=B0=E6=8D=AE?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- weiboSpider.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/weiboSpider.py b/weiboSpider.py index dd919708..962e2067 100644 --- a/weiboSpider.py +++ b/weiboSpider.py @@ -924,13 +924,14 @@ def update_user_config_file(self, user_config_file_path): info = line.split(' ') if len(info) > 0 and info[0].isdigit(): if self.user_config['user_id'] == info[0]: + today = datetime.now().strftime('%Y-%m-%d') if len(info) == 1: info.append(self.user['nickname']) - info.append(self.weibo[0]['publish_time'][:10]) + info.append(today) if len(info) == 2: - info.append(self.weibo[0]['publish_time'][:10]) + info.append(today) if len(info) > 2: - info[2] = self.weibo[0]['publish_time'][:10] + info[2] = today lines[i] = ' '.join(info) break with codecs.open(user_config_file_path, 'w', encoding="utf-8") as f: From d547c524c4dd9f9313554c9973ca65f60b180e10 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Cdataabc?= Date: Mon, 13 Jan 2020 18:15:31 +0800 Subject: [PATCH 121/363] =?UTF-8?q?fix:=20=E4=BF=AE=E5=A4=8D=E5=86=99?= =?UTF-8?q?=E5=85=A5user=5Fid=5Flist=E6=96=87=E4=BB=B6=E6=97=B6=E9=97=B4?= =?UTF-8?q?=E6=95=B0=E6=8D=AE=E5=8F=AF=E8=83=BD=E5=87=BA=E9=94=99=E7=9A=84?= =?UTF-8?q?=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Issue #103 --- weiboSpider.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/weiboSpider.py b/weiboSpider.py index 962e2067..2692d0cd 100644 --- a/weiboSpider.py +++ b/weiboSpider.py @@ -53,6 +53,7 @@ def __init__(self, config): 
} for user_id in user_id_list] self.user_config_list = user_config_list # 要爬取的微博用户的user_config列表 self.user_config = {} # 用户配置,包含用户id和since_date + self.today = '' # 获取用户第一条微博时的日期 self.user = {} # 存储爬取到的用户信息 self.got_num = 0 # 存储爬取到的微博数 self.weibo = [] # 存储爬取到的所有微博信息 @@ -924,14 +925,13 @@ def update_user_config_file(self, user_config_file_path): info = line.split(' ') if len(info) > 0 and info[0].isdigit(): if self.user_config['user_id'] == info[0]: - today = datetime.now().strftime('%Y-%m-%d') if len(info) == 1: info.append(self.user['nickname']) - info.append(today) + info.append(self.today) if len(info) == 2: - info.append(today) + info.append(self.today) if len(info) > 2: - info[2] = today + info[2] = self.today lines[i] = ' '.join(info) break with codecs.open(user_config_file_path, 'w', encoding="utf-8") as f: @@ -961,6 +961,7 @@ def get_weibo_info(self): wrote_num = 0 page1 = 0 random_pages = random.randint(1, 5) + self.today = datetime.now().strftime('%Y-%m-%d') for page in tqdm(range(1, page_num + 1), desc='Progress'): is_end = self.get_one_page(page) # 获取第page页的全部微博 if is_end: From 51e7b7c251eb1d0c89e2e44334cf0149bc95f099 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Cdataabc?= Date: Mon, 13 Jan 2020 19:33:06 +0800 Subject: [PATCH 122/363] =?UTF-8?q?perf:=20=E4=BC=98=E5=8C=96since=5Fdate?= =?UTF-8?q?=E7=B2=BE=E5=BA=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 最小单位由天优化到分钟,更加精确,效率更高 Issue #103 --- weiboSpider.py | 41 +++++++++++++++++++++++++++++------------ 1 file changed, 29 insertions(+), 12 deletions(-) diff --git a/weiboSpider.py b/weiboSpider.py index 2692d0cd..09e86f9b 100644 --- a/weiboSpider.py +++ b/weiboSpider.py @@ -53,7 +53,7 @@ def __init__(self, config): } for user_id in user_id_list] self.user_config_list = user_config_list # 要爬取的微博用户的user_config列表 self.user_config = {} # 用户配置,包含用户id和since_date - self.today = '' # 获取用户第一条微博时的日期 + self.start_time = '' # 获取用户第一条微博时的时间 self.user = {} # 
存储爬取到的用户信息 self.got_num = 0 # 存储爬取到的微博数 self.weibo = [] # 存储爬取到的所有微博信息 @@ -98,11 +98,22 @@ def validate_config(self, config): def is_date(self, since_date): """判断日期格式是否正确""" try: - datetime.strptime(since_date, "%Y-%m-%d") + if ':' in since_date: + datetime.strptime(since_date, '%Y-%m-%d %H:%M') + else: + datetime.strptime(since_date, '%Y-%m-%d') return True except ValueError: return False + def str_to_time(self, text): + """将字符串转换成时间类型""" + if ':' in text: + result = datetime.strptime(text, '%Y-%m-%d %H:%M') + else: + result = datetime.strptime(text, '%Y-%m-%d') + return result + def handle_html(self, url): """处理html""" try: @@ -616,10 +627,9 @@ def get_one_page(self, page): if weibo: if weibo['id'] in self.weibo_id_list: continue - publish_time = datetime.strptime( - weibo['publish_time'][:10], "%Y-%m-%d") - since_date = datetime.strptime( - self.user_config['since_date'], "%Y-%m-%d") + publish_time = self.str_to_time(weibo['publish_time']) + since_date = self.str_to_time( + self.user_config['since_date']) if publish_time < since_date: if self.is_pinned_weibo(info[i]): continue @@ -927,14 +937,17 @@ def update_user_config_file(self, user_config_file_path): if self.user_config['user_id'] == info[0]: if len(info) == 1: info.append(self.user['nickname']) - info.append(self.today) + info.append(self.start_time) if len(info) == 2: - info.append(self.today) + info.append(self.start_time) + if len(info) > 3 and self.is_date(info[2] + ' ' + + info[3]): + del info[3] if len(info) > 2: - info[2] = self.today + info[2] = self.start_time lines[i] = ' '.join(info) break - with codecs.open(user_config_file_path, 'w', encoding="utf-8") as f: + with codecs.open(user_config_file_path, 'w', encoding='utf-8') as f: f.write('\n'.join(lines)) def write_data(self, wrote_num): @@ -961,7 +974,7 @@ def get_weibo_info(self): wrote_num = 0 page1 = 0 random_pages = random.randint(1, 5) - self.today = datetime.now().strftime('%Y-%m-%d') + self.start_time = datetime.now().strftime('%Y-%m-%d 
%H:%M') for page in tqdm(range(1, page_num + 1), desc='Progress'): is_end = self.get_one_page(page) # 获取第page页的全部微博 if is_end: @@ -1000,7 +1013,11 @@ def get_user_config_list(self, file_name): user_config = {} user_config['user_id'] = info[0] if len(info) > 2 and self.is_date(info[2]): - user_config['since_date'] = info[2] + if len(info) > 3 and self.is_date(info[2] + ' ' + + info[3]): + user_config['since_date'] = info[2] + ' ' + info[3] + else: + user_config['since_date'] = info[2] else: user_config['since_date'] = self.since_date user_config_list.append(user_config) From cfbd44a233bcdbb94a8a29bc1ec8c497db68652c Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Tue, 14 Jan 2020 21:42:43 +0800 Subject: [PATCH 123/363] Update README.md --- README.md | 37 ++++++++++++++++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index acb56ffb..317e84ec 100644 --- a/README.md +++ b/README.md @@ -10,6 +10,7 @@ * [设置数据库(可选)](#4设置数据库可选) * [运行脚本](#5运行脚本) * [按需求修改脚本(可选)](#6按需求修改脚本可选) + * [定期自动爬取微博(可选)](#7定期自动爬取微博可选) * [如何获取cookie](#如何获取cookie) * [如何获取user_id](#如何获取user_id) * [注意事项](#注意事项) @@ -30,7 +31,7 @@ - 下载用户**转发**微博**Live Photo**中的**视频**([免cookie版](https://github.com/dataabc/weibo-crawler)特有)
当然,如果你只对用户信息感兴趣,而不需要爬用户的微博,也可以通过设置实现只爬取微博用户信息的功能。
-程序也可以实现**爬取结果自动更新**,即:现在爬取了目标用户的微博,几天之后,目标用户可能又发新微博了。通过设置,可以实现每隔几天增量爬取用户这几天发的新微博。
+程序也可以实现**爬取结果自动更新**,即:现在爬取了目标用户的微博,几天之后,目标用户可能又发新微博了。通过设置,可以实现每隔几天**增量爬取**用户这几天发的新微博。具体方法见[定期自动爬取微博](#7定期自动爬取微博可选)。
本程序需要设置用户cookie,以获取微博访问权限,后面会讲解如何获取cookie。如需[免cookie版](https://github.com/dataabc/weibo-crawler),大家可以访问,二者功能类似,免cookie版获取的信息更多,用法更简单,而且不需要cookie。
## 输出 本部分为爬取到的字段信息说明,为了与[免cookie版](https://github.com/dataabc/weibo-crawler)区分,下面将两者爬取到的信息都列出来。如果是免cookie版所特有的信息,会有免cookie标注,没有标注的为二者共有的信息。
@@ -224,6 +225,7 @@ since_date值可以是日期,也可以是整数。如果是日期,代表爬 "since_date": 10, ``` 代表爬取最近10天的微博,这个说法不是特别准确,准确说是爬取发布时间从**10天前到本程序开始执行时**之间的微博。
+**since_date是所有user的爬取起始时间,非常不灵活。如果你要爬多个用户,并且想单独为每个用户设置一个since_date,可以使用[定期自动爬取微博](#7定期自动爬取微博可选)方法二中的方法,该方法可以为多个用户设置不同的since_date,非常灵活。**
**设置write_mode**
write_mode控制结果文件格式,取值范围是csv、txt、json、mongo和mysql,分别代表将结果文件写入csv、txt、json、MongoDB和MySQL数据库。write_mode可以同时包含这些取值中的一个或几个,如: ``` @@ -330,6 +332,39 @@ $ python weibospider.py +### 7.定期自动爬取微博(可选) +我们爬取了微博以后,很多微博账号又可能发了一些新微博,定期自动爬取微博就是每隔一段时间自动运行程序,自动爬取这段时间产生的新微博(忽略以前爬过的旧微博)。本部分为可选部分,如果不需要可以忽略。
+思路是**利用第三方软件,如crontab,让程序每隔一段时间运行一次**。因为要跳过以前爬过的旧微博、只爬新微博,所以需要**设置一个动态的since_date**。很多时候我们使用的since_date是固定的,比如since_date="2018-01-01",程序就会按照这个设置从最新的微博一直爬到发布时间为2018-01-01的微博(包括这个时间)。因为我们想追加新微博、跳过旧微博,第二次爬取时since_date的值就应该是上次爬取的时间,这样爬到的就是从上次爬取到当前时间之间发布的微博。 +如果我们使用最原始的方式实现追加爬取,应该是这样: +``` +假如程序第一次执行时间是2019-06-06,since_date假如为2018-01-01,那这一次就是爬取从2018-01-01到2019-06-06这段时间用户所发的微博; +第二次爬取,我们想要接着上次的爬,那since_date的值应该是上次程序执行的日期,即2019-06-06 +``` +上面的方法太麻烦,因为每次都要手动设置since_date。因此我们需要动态设置since_date,即程序根据实际情况,自动生成since_date。
+有两种方法实现动态更新since_date:
+**方法一:将since_date设置成整数** +将config.json文件中的since_date设置成整数,如: +``` +"since_date": 10, +``` +这个配置告诉程序爬取最近10天的微博,更准确说是爬取发布时间从**10天前到本程序开始执行时**之间的微博。这样since_date就是一个动态的变量,每次程序执行时,它的值就是当前日期减去10天。配合crontab每9天或10天执行一次,就实现了定期追加爬取。
+**方法二:将上次执行程序的时间写入文件(推荐)**
+这个方法很简单,就是使用者把要爬取的微博用户的user_id写入txt文件,然后再把该txt文件的路径赋值给config.json中的user_id_list参数。
+txt文件名格式可以参考[程序设置](#3程序设置)中的**设置user_id_list**,这样设置就全部结束了。
+说下这个方法的好处和原理,假如你的txt文件内容为: +``` +1669879400 +1223178222 胡歌 +1729370543 郭碧婷 2019-01-01 19:28 +``` +第一次执行时,因为第一行和第二行都没有写时间,程序会按照config.json文件中since_date的值爬取,第三行有时间“2019-01-01 19:28”,程序就会把这个时间当作since_date。每个用户爬取结束程序都会自动更新txt文件,每一行第一部分是user_id,第二部分是用户昵称,第三部分是程序**准备**爬取该用户第一条微博(最新微博)时的时间。爬完三个用户后,txt文件的内容自动更新为: +``` +1669879400 Dear-迪丽热巴 2020-01-13 19:18 +1223178222 胡歌 2020-01-13 19:28 +1729370543 郭碧婷 2020-01-13 19:33 +``` +下次再爬取微博的时候,程序会把每行的时间数据作为since_date。这样的好处一是不用修改since_date,程序自动更新;二是每一个用户都可以单独拥有只属于自己的since_date,每个用户的since_date相互独立,互不干扰。since_date既可以是“yyyy-mm-dd”格式,也可以是“yyyy-mm-dd hh:mm”格式。比如,现在又添加了一个新用户,因为是新用户,你想爬该用户的全部微博,只需要将该用户user_id所在行的时间部分设置为“1900-01-01”就好,即user_id 昵称 1900-01-01。
+推荐第二种方法,本方法是[Evifly](https://github.com/Evifly)想出的,非常热心非常有想法的网友,在此感谢。
## 如何获取cookie 1.用Chrome打开
2.输入微博的用户名、密码,登录,如图所示: From 620fc637b00661d5c5b4d61b07f772d85dc138ee Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Sun, 19 Jan 2020 18:42:22 +0800 Subject: [PATCH 124/363] Update README.md --- README.md | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 317e84ec..d077ef02 100644 --- a/README.md +++ b/README.md @@ -363,7 +363,15 @@ txt文件名格式可以参考[程序设置](#3程序设置)中的**设置user_i 1223178222 胡歌 2020-01-13 19:28 1729370543 郭碧婷 2020-01-13 19:33 ``` -下次再爬取微博的时候,程序会把每行的时间数据作为since_date。这样的好处一是不用修改since_date,程序自动更新;二是每一个用户都可以单独拥有只属于自己的since_date,每个用户的since_date相互独立,互不干扰。since_date既可以是“yyyy-mm-dd”格式,也可以是“yyyy-mm-dd hh:mm”格式。比如,现在又添加了一个新用户,因为是新用户,你想爬该用户的全部微博,只需要将该用户user_id所在行的时间部分设置为“1900-01-01”就好,即user_id 昵称 1900-01-01。
+下次再爬取微博的时候,程序会把每行的时间数据作为since_date。这样的好处一是不用修改since_date,程序自动更新;二是每一个用户都可以单独拥有只属于自己的since_date,每个用户的since_date相互独立,互不干扰。since_date既可以是“yyyy-mm-dd”格式,也可以是“yyyy-mm-dd hh:mm”格式。比如,现在又添加了一个新用户,例如杨紫,你想获取她从2018-01-23到现在的全部微博,只需要这样修改txt文件: +``` +1669879400 Dear-迪丽热巴 2020-01-13 19:18 +1223178222 胡歌 2020-01-13 19:28 +1729370543 郭碧婷 2020-01-13 19:33 +1227368500 杨紫 2018-01-23 +``` +注意每一行的用户配置参数以空格分隔,如果第一个参数全部由数字组成,程序就认为此行为一个用户的配置,否则程序会认为该行只是注释,跳过该行;第二个参数可以为任意内容(但不能包含空格),建议写用户昵称;第三个参数如果是日期格式(yyyy-mm-dd或yyyy-mm-dd hh:mm),程序就将该时间设置为用户自己的since_date,否则使用config.json中的since_date爬取该用户的微博;第二个参数和第三个参数也可以不填。 + 推荐第二种方法,本方法是[Evifly](https://github.com/Evifly)想出的,非常热心非常有想法的网友,在此感谢。
## 如何获取cookie 1.用Chrome打开
From 506d837ad6ea38fc3927f30077d2426281ceaa8e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Cdataabc?= Date: Mon, 20 Jan 2020 18:43:25 +0800 Subject: [PATCH 125/363] =?UTF-8?q?perf:=20=E4=BC=98=E5=8C=96user=5Fid?= =?UTF-8?q?=E8=AF=BB=E5=8F=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 当使用者将个性域名等错认为user_id时,程序可以自动修复为正确的user_id --- weiboSpider.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/weiboSpider.py b/weiboSpider.py index 09e86f9b..69d02860 100644 --- a/weiboSpider.py +++ b/weiboSpider.py @@ -197,9 +197,21 @@ def print_user_info(self): print(u'关注数: %d' % self.user['following']) print(u'粉丝数: %d' % self.user['followers']) + def update_user_id(self, selector): + """更新用户id,使用者输入的user_id不一定是正确的,可能是个性域名等,需要更新成真正的user_id""" + url_list = selector.xpath("//div[@class='u']//a") + for url in url_list: + if (url.xpath('string(.)')) == u'资料': + if url.xpath('@href') and url.xpath('@href')[0].endswith( + '/info'): + link = url.xpath('@href')[0] + self.user_config['user_id'] = link[1:-5] + break + def get_user_info(self, selector): """获取用户昵称、微博数、关注数、粉丝数""" try: + self.update_user_id(selector) self.get_nickname() # 获取用户昵称 user_info = selector.xpath("//div[@class='tip2']/*/text()") weibo_num = int(user_info[0][3:-1]) @@ -616,8 +628,8 @@ def is_pinned_weibo(self, info): def get_one_page(self, page): """获取第page页的全部微博""" try: - url = 'https://weibo.cn/u/%s?page=%d' % ( - self.user_config['user_id'], page) + url = 'https://weibo.cn/%s?page=%d' % (self.user_config['user_id'], + page) selector = self.handle_html(url) info = selector.xpath("//div[@class='c']") is_exist = info[0].xpath("div/span[@class='ctt']") @@ -967,7 +979,7 @@ def write_data(self, wrote_num): def get_weibo_info(self): """获取微博信息""" try: - url = 'https://weibo.cn/u/%s' % (self.user_config['user_id']) + url = 'https://weibo.cn/%s' % (self.user_config['user_id']) selector = self.handle_html(url) self.get_user_info(selector) # 
获取用户昵称、微博数、关注数、粉丝数 page_num = self.get_page_num(selector) # 获取微博总页数 From 6b3dfbb0b3b1574356accee2e0ecce94244c3081 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Cdataabc?= Date: Thu, 23 Jan 2020 18:46:49 +0800 Subject: [PATCH 126/363] =?UTF-8?q?fix:=20=E4=BF=AE=E5=A4=8D=E5=9C=A8?= =?UTF-8?q?=E8=BE=93=E5=85=A5=E4=B8=8D=E6=AD=A3=E7=A1=AE=E7=9A=84user=5Fid?= =?UTF-8?q?(=E4=B8=AA=E6=80=A7=E5=9F=9F=E5=90=8D=E7=AD=89)=E6=97=B6?= =?UTF-8?q?=E6=97=A0=E6=B3=95=E6=9B=B4=E6=96=B0=E5=AF=B9=E5=BA=94since=5Fd?= =?UTF-8?q?ate=E7=9A=84=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- weiboSpider.py | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/weiboSpider.py b/weiboSpider.py index 69d02860..0ff39923 100644 --- a/weiboSpider.py +++ b/weiboSpider.py @@ -48,7 +48,7 @@ def __init__(self, config): else: self.user_config_file_path = '' user_config_list = [{ - 'user_id': user_id, + 'user_uri': user_id, 'since_date': self.since_date } for user_id in user_id_list] self.user_config_list = user_config_list # 要爬取的微博用户的user_config列表 @@ -144,7 +144,7 @@ def get_nickname(self): if nickname == u'登录 - 新' or nickname == u'新浪': self.write_log() sys.exit(u'cookie错误或已过期,请按照README中方法重新获取') - self.user['nickname'] = nickname + return nickname except Exception as e: print('Error: ', e) traceback.print_exc() @@ -197,8 +197,9 @@ def print_user_info(self): print(u'关注数: %d' % self.user['following']) print(u'粉丝数: %d' % self.user['followers']) - def update_user_id(self, selector): - """更新用户id,使用者输入的user_id不一定是正确的,可能是个性域名等,需要更新成真正的user_id""" + def get_user_id(self, selector): + """获取用户id,使用者输入的user_id不一定是正确的,可能是个性域名等,需要获取真正的user_id""" + self.user_config['user_id'] = self.user_config['user_uri'] url_list = selector.xpath("//div[@class='u']//a") for url in url_list: if (url.xpath('string(.)')) == u'资料': @@ -207,12 +208,13 @@ def update_user_id(self, selector): link = url.xpath('@href')[0] 
self.user_config['user_id'] = link[1:-5] break + return self.user_config['user_id'] def get_user_info(self, selector): - """获取用户昵称、微博数、关注数、粉丝数""" + """获取用户id、昵称、微博数、关注数、粉丝数""" try: - self.update_user_id(selector) - self.get_nickname() # 获取用户昵称 + self.user['id'] = self.get_user_id(selector) + self.user['nickname'] = self.get_nickname() # 获取用户昵称 user_info = selector.xpath("//div[@class='tip2']/*/text()") weibo_num = int(user_info[0][3:-1]) following = int(user_info[1][3:-1]) @@ -220,10 +222,10 @@ def get_user_info(self, selector): self.user['weibo_num'] = weibo_num self.user['following'] = following self.user['followers'] = followers - self.user['id'] = self.user_config['user_id'] self.print_user_info() self.user_to_database() print('*' * 100) + return self.user except Exception as e: print('Error: ', e) traceback.print_exc() @@ -628,8 +630,8 @@ def is_pinned_weibo(self, info): def get_one_page(self, page): """获取第page页的全部微博""" try: - url = 'https://weibo.cn/%s?page=%d' % (self.user_config['user_id'], - page) + url = 'https://weibo.cn/%s?page=%d' % ( + self.user_config['user_uri'], page) selector = self.handle_html(url) info = selector.xpath("//div[@class='c']") is_exist = info[0].xpath("div/span[@class='ctt']") @@ -946,7 +948,7 @@ def update_user_config_file(self, user_config_file_path): for i, line in enumerate(lines): info = line.split(' ') if len(info) > 0 and info[0].isdigit(): - if self.user_config['user_id'] == info[0]: + if self.user_config['user_uri'] == info[0]: if len(info) == 1: info.append(self.user['nickname']) info.append(self.start_time) @@ -979,7 +981,7 @@ def write_data(self, wrote_num): def get_weibo_info(self): """获取微博信息""" try: - url = 'https://weibo.cn/%s' % (self.user_config['user_id']) + url = 'https://weibo.cn/%s' % (self.user_config['user_uri']) selector = self.handle_html(url) self.get_user_info(selector) # 获取用户昵称、微博数、关注数、粉丝数 page_num = self.get_page_num(selector) # 获取微博总页数 @@ -1023,7 +1025,7 @@ def get_user_config_list(self, file_name): 
info = line.split(' ') if len(info) > 0 and info[0].isdigit(): user_config = {} - user_config['user_id'] = info[0] + user_config['user_uri'] = info[0] if len(info) > 2 and self.is_date(info[2]): if len(info) > 3 and self.is_date(info[2] + ' ' + info[3]): From 06daa2e4098a525d4a5df2bd3ea1d32acd73086b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Cdataabc?= Date: Mon, 27 Jan 2020 15:16:39 +0800 Subject: [PATCH 127/363] =?UTF-8?q?fix:=20=E4=BF=AE=E5=A4=8D=E5=9B=A0user?= =?UTF-8?q?=5Fid=5Flist.txt=E7=BC=96=E7=A0=81=E6=A0=BC=E5=BC=8F=E4=B8=8D?= =?UTF-8?q?=E6=AD=A3=E7=A1=AE=E7=A8=8B=E5=BA=8F=E5=87=BA=E9=94=99=E7=9A=84?= =?UTF-8?q?=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit user_id_list.txt文件应为utf-8编码 Issue #108 --- weiboSpider.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/weiboSpider.py b/weiboSpider.py index 0ff39923..c25ff927 100644 --- a/weiboSpider.py +++ b/weiboSpider.py @@ -1018,8 +1018,11 @@ def get_weibo_info(self): def get_user_config_list(self, file_name): """获取文件中的微博id信息""" with open(file_name, 'rb') as f: - lines = f.read().splitlines() - lines = [line.decode('utf-8') for line in lines] + try: + lines = f.read().splitlines() + lines = [line.decode('utf-8') for line in lines] + except UnicodeDecodeError: + sys.exit(u'%s文件应为utf-8编码,请先将文件编码转为utf-8再运行程序' % file_name) user_config_list = [] for line in lines: info = line.split(' ') From a176c55cd4d681ce995a5312af656eb2857b155d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Cdataabc?= Date: Mon, 27 Jan 2020 17:38:29 +0800 Subject: [PATCH 128/363] =?UTF-8?q?perf:=20=E4=BC=98=E5=8C=96config.json?= =?UTF-8?q?=E6=A3=80=E6=B5=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 以前user_id_list.txt文件编码不正确程序会误认为config.json格式不正确,现在已修复,只有真正config.json格式错误才会报config.json相关错误 Issue #108 --- weiboSpider.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git 
a/weiboSpider.py b/weiboSpider.py index c25ff927..2c5b01fa 100644 --- a/weiboSpider.py +++ b/weiboSpider.py @@ -1076,12 +1076,13 @@ def main(): sys.exit(u'当前路径:%s 不存在配置文件config.json' % (os.path.split(os.path.realpath(__file__))[0] + os.sep)) with open(config_path) as f: - config = json.loads(f.read()) + try: + config = json.loads(f.read()) + except ValueError: + sys.exit(u'config.json 格式不正确,请参考 ' + u'https://github.com/dataabc/weiboSpider#3程序设置') wb = Weibo(config) wb.start() # 爬取微博信息 - except ValueError: - print(u'config.json 格式不正确,请参考 ' - u'https://github.com/dataabc/weiboSpider#3程序设置') except Exception as e: print('Error: ', e) traceback.print_exc() From 604ee21f00c35b51e2d5b03e69d35b2e5818766e Mon Sep 17 00:00:00 2001 From: duangan1 <58961405+duangan1@users.noreply.github.com> Date: Tue, 28 Jan 2020 12:34:08 +0800 Subject: [PATCH 129/363] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E7=94=B1windows?= =?UTF-8?q?=E8=87=AA=E5=B8=A6=E7=BC=96=E8=BE=91=E5=99=A8=E8=AE=BE=E7=BD=AE?= =?UTF-8?q?user=5Fid=5Flist.txt=E7=9A=84=E4=B8=80=E4=BA=9B=E9=97=AE?= =?UTF-8?q?=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- weiboSpider.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/weiboSpider.py b/weiboSpider.py index 2c5b01fa..dd2c6e2c 100644 --- a/weiboSpider.py +++ b/weiboSpider.py @@ -944,7 +944,7 @@ def update_user_config_file(self, user_config_file_path): """更新用户配置文件""" with open(user_config_file_path, 'rb') as f: lines = f.read().splitlines() - lines = [line.decode('utf-8') for line in lines] + lines = [line.decode('utf-8-sig') for line in lines] for i, line in enumerate(lines): info = line.split(' ') if len(info) > 0 and info[0].isdigit(): @@ -1020,7 +1020,7 @@ def get_user_config_list(self, file_name): with open(file_name, 'rb') as f: try: lines = f.read().splitlines() - lines = [line.decode('utf-8') for line in lines] + lines = [line.decode('utf-8-sig') for line in lines] except UnicodeDecodeError: 
sys.exit(u'%s文件应为utf-8编码,请先将文件编码转为utf-8再运行程序' % file_name) user_config_list = [] From 5d3a2b60c4608c3f1d520efd1f58cbd6bf67e51d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Cdataabc?= Date: Thu, 30 Jan 2020 17:56:02 +0800 Subject: [PATCH 130/363] =?UTF-8?q?feat:=20=E6=B7=BB=E5=8A=A0=E6=89=93?= =?UTF-8?q?=E5=8D=B0=E5=BE=AE=E5=8D=9Aurl=E5=8A=9F=E8=83=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- weiboSpider.py | 1 + 1 file changed, 1 insertion(+) diff --git a/weiboSpider.py b/weiboSpider.py index dd2c6e2c..cbcbf2d4 100644 --- a/weiboSpider.py +++ b/weiboSpider.py @@ -618,6 +618,7 @@ def print_one_weibo(self, weibo): print(u'点赞数:%d' % weibo['up_num']) print(u'转发数:%d' % weibo['retweet_num']) print(u'评论数:%d' % weibo['comment_num']) + print(u'url:https://weibo.cn/comment/%s' % weibo['id']) def is_pinned_weibo(self, info): """判断微博是否为置顶微博""" From 22b443e921c80ef068cbe34cd38eced6c4da16c2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Cdataabc?= Date: Thu, 30 Jan 2020 18:26:19 +0800 Subject: [PATCH 131/363] =?UTF-8?q?feat:=20=E6=B7=BB=E5=8A=A0=E5=BE=AE?= =?UTF-8?q?=E5=8D=9A=E7=94=A8=E6=88=B7url=E5=8A=9F=E8=83=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- weiboSpider.py | 1 + 1 file changed, 1 insertion(+) diff --git a/weiboSpider.py b/weiboSpider.py index cbcbf2d4..a1128a85 100644 --- a/weiboSpider.py +++ b/weiboSpider.py @@ -196,6 +196,7 @@ def print_user_info(self): print(u'微博数: %d' % self.user['weibo_num']) print(u'关注数: %d' % self.user['following']) print(u'粉丝数: %d' % self.user['followers']) + print(u'url:https://weibo.cn/%s' % self.user['id']) def get_user_id(self, selector): """获取用户id,使用者输入的user_id不一定是正确的,可能是个性域名等,需要获取真正的user_id""" From 5a012c97e4955a5c0a45e90b23fb38fcb8f1909f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Cdataabc?= Date: Fri, 31 Jan 2020 12:58:46 +0800 Subject: [PATCH 132/363] =?UTF-8?q?perf:=20=E4=BC=98=E5=8C=96=E5=BE=AE?= 
=?UTF-8?q?=E5=8D=9A=E8=BF=9B=E7=A8=8B=E6=98=BE=E7=A4=BA?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 主要是添加显示当前用户的昵称、id和当前微博页数 --- weiboSpider.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/weiboSpider.py b/weiboSpider.py index a1128a85..007ed1e0 100644 --- a/weiboSpider.py +++ b/weiboSpider.py @@ -650,12 +650,19 @@ def get_one_page(self, page): if self.is_pinned_weibo(info[i]): continue else: + print(u'{}已获取{}({})的第{}页微博{}'.format( + '-' * 30, self.user['nickname'], + self.user['id'], page, '-' * 30)) return True self.print_one_weibo(weibo) self.weibo.append(weibo) self.weibo_id_list.append(weibo['id']) self.got_num += 1 print('-' * 100) + print(u'{}已获取{}({})的第{}页微博{}'.format('-' * 30, + self.user['nickname'], + self.user['id'], page, + '-' * 30)) except Exception as e: print('Error: ', e) traceback.print_exc() From c2ad2786d871ef028f3cd025675e9b7dc0e1dedd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Cdataabc?= Date: Sat, 1 Feb 2020 12:18:17 +0800 Subject: [PATCH 133/363] =?UTF-8?q?perf:=20=E4=BC=98=E5=8C=96=E6=96=87?= =?UTF-8?q?=E4=BB=B6=E4=B8=8B=E8=BD=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- weiboSpider.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/weiboSpider.py b/weiboSpider.py index 007ed1e0..e7f99836 100644 --- a/weiboSpider.py +++ b/weiboSpider.py @@ -558,7 +558,7 @@ def handle_download(self, file_type, file_dir, urls, w): file_path = file_dir + os.sep + file_name self.download_one_file(urls, file_path, file_type, w['id']) - def download_files(self, file_type): + def download_files(self, file_type, wrote_num): """下载文件(图片/视频)""" try: if file_type == 'img': @@ -569,7 +569,7 @@ def download_files(self, file_type): key = 'video_url' print(u'即将进行%s下载' % describe) file_dir = self.get_filepath(file_type) - for w in tqdm(self.weibo, desc='Download progress'): + for w in tqdm(self.weibo[wrote_num:], 
desc='Download progress'): if w[key] != u'无': self.handle_download(file_type, file_dir, w[key], w) print(u'%s下载完毕,保存路径:' % describe) @@ -986,6 +986,10 @@ def write_data(self, wrote_num): self.weibo_to_mysql(wrote_num) if 'mongo' in self.write_mode: self.weibo_to_mongodb(wrote_num) + if self.pic_download == 1: + self.download_files('img', wrote_num) + if self.video_download == 1: + self.download_files('video', wrote_num) def get_weibo_info(self): """获取微博信息""" @@ -1068,10 +1072,6 @@ def start(self): print('*' * 100) if self.user_config_file_path: self.update_user_config_file(self.user_config_file_path) - if self.pic_download == 1: - self.download_files('img') - if self.video_download == 1: - self.download_files('video') except Exception as e: print('Error: ', e) traceback.print_exc() From a5be279e7f514158e70a50278132d762f84ad7b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Cdataabc?= Date: Tue, 4 Feb 2020 13:29:50 +0800 Subject: [PATCH 134/363] =?UTF-8?q?fix:=20=E4=BF=AE=E5=A4=8D=E5=9C=A8?= =?UTF-8?q?=E4=BF=AE=E6=94=B9=E7=88=AC=E5=8F=96=E8=B5=B7=E5=A7=8B=E9=A1=B5?= =?UTF-8?q?=E7=A0=81=E5=90=8E=E5=8F=AF=E8=83=BD=E5=AF=BC=E8=87=B4=E6=97=A0?= =?UTF-8?q?=E6=B3=95=E9=9A=8F=E6=9C=BA=E7=AD=89=E5=BE=85=E7=9A=84=E9=97=AE?= =?UTF-8?q?=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 程序默认从微博第一页开始爬取,每爬取1到5页随机sleep6到10秒。当修改后起始页码为大于5的值时,导致page-page1大于5,随着循环,page-page1会越来越大,永远不会满足随机等待的条件。现在通过将page-page1对random_pages取余,可以完全修复此问题 --- weiboSpider.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/weiboSpider.py b/weiboSpider.py index e7f99836..75ebfcee 100644 --- a/weiboSpider.py +++ b/weiboSpider.py @@ -1014,7 +1014,7 @@ def get_weibo_info(self): # 通过加入随机等待避免被限制。爬虫速度过快容易被系统限制(一段时间后限 # 制会自动解除),加入随机等待模拟人的操作,可降低被系统限制的风险。默 # 认是每爬取1到5页随机等待6到10秒,如果仍然被限,可适当增加sleep时间 - if page - page1 == random_pages and page < page_num: + if (page - page1) % random_pages == 0 and page < page_num: sleep(random.randint(6, 10)) page1 = 
page random_pages = random.randint(1, 5) From c23081d146e22f78c4731880a3dbff871f4a0aac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Cdataabc?= Date: Thu, 6 Feb 2020 12:58:40 +0800 Subject: [PATCH 135/363] =?UTF-8?q?perf:=20=E4=BC=98=E5=8C=96=E9=95=BF?= =?UTF-8?q?=E5=BE=AE=E5=8D=9A=E8=8E=B7=E5=8F=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 因为各种原因,长微博可能第一次获取失败,为了获取完整长微博,程序会尝试获取,失败后会再次获取。为了不影响后面微博的获取,长微博最多尝试5次 Issue #44 --- weiboSpider.py | 42 ++++++++++++++++++++++++------------------ 1 file changed, 24 insertions(+), 18 deletions(-) diff --git a/weiboSpider.py b/weiboSpider.py index 75ebfcee..5c763a30 100644 --- a/weiboSpider.py +++ b/weiboSpider.py @@ -247,13 +247,17 @@ def get_page_num(self, selector): def get_long_weibo(self, weibo_link): """获取长原创微博""" try: - selector = self.handle_html(weibo_link) - info = selector.xpath("//div[@class='c']")[1] - wb_content = self.handle_garbled(info) - wb_time = info.xpath("//span[@class='ct']/text()")[0] - weibo_content = wb_content[wb_content.find(':') + - 1:wb_content.rfind(wb_time)] - return weibo_content + for i in range(5): + selector = self.handle_html(weibo_link) + if selector is not None: + info = selector.xpath("//div[@class='c']")[1] + wb_content = self.handle_garbled(info) + wb_time = info.xpath("//span[@class='ct']/text()")[0] + weibo_content = wb_content[wb_content.find(':') + + 1:wb_content.rfind(wb_time)] + if weibo_content is not None: + return weibo_content + sleep(random.randint(6, 10)) except Exception as e: return u'网络出错' print('Error: ', e) @@ -288,26 +292,28 @@ def get_long_retweet(self, weibo_link): def get_retweet(self, info, weibo_id): """获取转发微博""" try: - wb_content = self.handle_garbled(info) - wb_content = wb_content[wb_content.find(':') + - 1:wb_content.rfind(u'赞')] - wb_content = wb_content[:wb_content.rfind(u'赞')] + weibo_content = self.handle_garbled(info) + weibo_content = weibo_content[weibo_content.find(':') + + 
1:weibo_content.rfind(u'赞')] + weibo_content = weibo_content[:weibo_content.rfind(u'赞')] a_text = info.xpath('div//a/text()') if u'全文' in a_text: weibo_link = 'https://weibo.cn/comment/' + weibo_id - weibo_content = self.get_long_retweet(weibo_link) - if weibo_content: - wb_content = weibo_content + wb_content = self.get_long_retweet(weibo_link) + if wb_content: + weibo_content = wb_content retweet_reason = self.handle_garbled(info.xpath('div')[-1]) retweet_reason = retweet_reason[:retweet_reason.rindex(u'赞')] original_user = info.xpath("div/span[@class='cmt']/a/text()") if original_user: original_user = original_user[0] - wb_content = (retweet_reason + '\n' + u'原始用户: ' + - original_user + '\n' + u'转发内容: ' + wb_content) + weibo_content = (retweet_reason + '\n' + u'原始用户: ' + + original_user + '\n' + u'转发内容: ' + + weibo_content) else: - wb_content = retweet_reason + '\n' + u'转发内容: ' + wb_content - return wb_content + weibo_content = (retweet_reason + '\n' + u'转发内容: ' + + weibo_content) + return weibo_content except Exception as e: print('Error: ', e) traceback.print_exc() From 9a96acbb449f273594ac3fce87afd762d92c006a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Cdataabc?= Date: Sun, 9 Feb 2020 10:16:30 +0800 Subject: [PATCH 136/363] =?UTF-8?q?fix:=20=E4=BF=AE=E5=A4=8D=E5=9B=A0?= =?UTF-8?q?=E5=9B=BE=E7=89=87=E9=93=BE=E6=8E=A5=E8=BF=87=E9=95=BF=E6=97=A0?= =?UTF-8?q?=E6=B3=95=E5=86=99=E5=85=A5=E6=95=B0=E6=8D=AE=E5=BA=93=E7=9A=84?= =?UTF-8?q?=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Issue #112 --- weiboSpider.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/weiboSpider.py b/weiboSpider.py index 5c763a30..6846d78f 100644 --- a/weiboSpider.py +++ b/weiboSpider.py @@ -930,8 +930,8 @@ def weibo_to_mysql(self, wrote_num): id varchar(10) NOT NULL, user_id varchar(12), content varchar(2000), - original_pictures varchar(1000), - retweet_pictures varchar(1000), + original_pictures 
varchar(3000), + retweet_pictures varchar(3000), original BOOLEAN NOT NULL DEFAULT 1, video_url varchar(300), publish_place varchar(100), From c746123a2fa485d16e0d4840bf2327c680d0d012 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Cdataabc?= Date: Fri, 21 Feb 2020 10:42:05 +0800 Subject: [PATCH 137/363] =?UTF-8?q?fix:=20=E4=BF=AE=E5=A4=8D=E9=83=A8?= =?UTF-8?q?=E5=88=86=E5=BE=AE=E5=8D=9A=E6=97=A0=E6=B3=95=E8=8E=B7=E5=8F=96?= =?UTF-8?q?=E5=9B=BE=E7=89=87=E4=BF=A1=E6=81=AF=E7=9A=84bug?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Issue #106 --- weiboSpider.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/weiboSpider.py b/weiboSpider.py index 6846d78f..bb2bbfda 100644 --- a/weiboSpider.py +++ b/weiboSpider.py @@ -439,6 +439,7 @@ def extract_picture_urls(self, info, weibo_id): a_list = info.xpath('div/a/@href') first_pic = 'https://weibo.cn/mblog/pic/' + weibo_id + '?rl=0' all_pic = 'https://weibo.cn/mblog/picAll/' + weibo_id + '?rl=1' + picture_urls = u'无' if first_pic in a_list: if all_pic in a_list: selector = self.handle_html(all_pic) @@ -450,16 +451,20 @@ def extract_picture_urls(self, info, weibo_id): picture_urls = ','.join(picture_list) else: if info.xpath('.//img/@src'): - preview_picture = info.xpath('.//img/@src')[-1] - picture_urls = preview_picture.replace( - '/wap180/', '/large/') + for link in info.xpath('div/a'): + if len(link.xpath('@href')) > 0: + if first_pic == link.xpath('@href')[0]: + if len(link.xpath('img/@src')) > 0: + preview_picture = link.xpath( + 'img/@src')[0] + picture_urls = preview_picture.replace( + '/wap180/', '/large/') + break else: sys.exit( u"爬虫微博可能被设置成了'不显示图片',请前往" u"'https://weibo.cn/account/customize/pic',修改为'显示'" ) - else: - picture_urls = u'无' return picture_urls except Exception as e: return u'无' From f471a295e7c85b70a45505ab9b37e950441eb41a Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Fri, 28 Feb 2020 11:58:58 +0800 Subject: [PATCH 138/363] 
Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index d077ef02..1ebd43cf 100644 --- a/README.md +++ b/README.md @@ -392,5 +392,5 @@ txt文件名格式可以参考[程序设置](#3程序设置)中的**设置user_i 事实上,此微博的user_id也包含在用户主页()中,之所以我们还要点击主页中的"资料"来获取user_id,是因为很多用户的主页不是""的形式,而是""或""的形式。其中"微号"和user_id都是一串数字,如果仅仅通过主页地址提取user_id,很容易将"微号"误认为user_id。 ## 注意事项 -1.user_id不能为爬虫微博的user_id。因为要爬微博信息,必须先登录到某个微博账号,此账号我们姑且称为爬虫微博。爬虫微博访问自己的页面和访问其他用户的页面,得到的网页格式不同,所以无法爬取自己的微博信息;
+1.user_id不能为爬虫微博的user_id。因为要爬微博信息,必须先登录到某个微博账号,此账号我们姑且称为爬虫微博。爬虫微博访问自己的页面和访问其他用户的页面,得到的网页格式不同,所以无法爬取自己的微博信息;如果想要爬取爬虫微博内容,可以参考[获取自身微博信息](https://github.com/dataabc/weiboSpider/issues/113)。
2.cookie有期限限制,超过有效期需重新更新cookie。 From 9df9d33c2ed417115198ea9ccc8def9296f0b0d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Cdataabc?= Date: Mon, 2 Mar 2020 19:19:52 +0800 Subject: [PATCH 139/363] =?UTF-8?q?perf:=20=E4=BC=98=E5=8C=96=E5=BE=AE?= =?UTF-8?q?=E5=8D=9A=E7=94=A8=E6=88=B7=E4=BF=A1=E6=81=AF=E8=8E=B7=E5=8F=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 当since_date比程序运行时间还新时,代表since_date是未来时间,微博发布时间不满足since_date,因此在这种情况省略微博获取,只获取用户信息,加快用户信息获取速度 --- weiboSpider.py | 44 ++++++++++++++++++++++++-------------------- 1 file changed, 24 insertions(+), 20 deletions(-) diff --git a/weiboSpider.py b/weiboSpider.py index bb2bbfda..9a5a892c 100644 --- a/weiboSpider.py +++ b/weiboSpider.py @@ -1008,29 +1008,33 @@ def get_weibo_info(self): url = 'https://weibo.cn/%s' % (self.user_config['user_uri']) selector = self.handle_html(url) self.get_user_info(selector) # 获取用户昵称、微博数、关注数、粉丝数 - page_num = self.get_page_num(selector) # 获取微博总页数 - wrote_num = 0 - page1 = 0 - random_pages = random.randint(1, 5) - self.start_time = datetime.now().strftime('%Y-%m-%d %H:%M') - for page in tqdm(range(1, page_num + 1), desc='Progress'): - is_end = self.get_one_page(page) # 获取第page页的全部微博 - if is_end: - break + since_date = self.str_to_time(self.user_config['since_date']) + now = datetime.now().strftime('%Y-%m-%d %H:%M') + now = datetime.strptime(now, '%Y-%m-%d %H:%M') + if since_date <= now: + page_num = self.get_page_num(selector) # 获取微博总页数 + wrote_num = 0 + page1 = 0 + random_pages = random.randint(1, 5) + self.start_time = datetime.now().strftime('%Y-%m-%d %H:%M') + for page in tqdm(range(1, page_num + 1), desc='Progress'): + is_end = self.get_one_page(page) # 获取第page页的全部微博 + if is_end: + break - if page % 20 == 0: # 每爬20页写入一次文件 - self.write_data(wrote_num) - wrote_num = self.got_num + if page % 20 == 0: # 每爬20页写入一次文件 + self.write_data(wrote_num) + wrote_num = self.got_num - # 通过加入随机等待避免被限制。爬虫速度过快容易被系统限制(一段时间后限 - # 
制会自动解除),加入随机等待模拟人的操作,可降低被系统限制的风险。默 - # 认是每爬取1到5页随机等待6到10秒,如果仍然被限,可适当增加sleep时间 - if (page - page1) % random_pages == 0 and page < page_num: - sleep(random.randint(6, 10)) - page1 = page - random_pages = random.randint(1, 5) + # 通过加入随机等待避免被限制。爬虫速度过快容易被系统限制(一段时间后限 + # 制会自动解除),加入随机等待模拟人的操作,可降低被系统限制的风险。默 + # 认是每爬取1到5页随机等待6到10秒,如果仍然被限,可适当增加sleep时间 + if (page - page1) % random_pages == 0 and page < page_num: + sleep(random.randint(6, 10)) + page1 = page + random_pages = random.randint(1, 5) - self.write_data(wrote_num) # 将剩余不足20页的微博写入文件 + self.write_data(wrote_num) # 将剩余不足20页的微博写入文件 if not self.filter: print(u'共爬取' + str(self.got_num) + u'条微博') else: From d600e9b6c4ad8f58c04c76e1f7cca5edbfca6d03 Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Fri, 6 Mar 2020 10:34:26 +0800 Subject: [PATCH 140/363] Update README.md --- README.md | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 1ebd43cf..e8910347 100644 --- a/README.md +++ b/README.md @@ -38,10 +38,16 @@ **用户信息** - 用户id:微博用户id,如"1669879400",其实这个字段本来就是已知字段 - 昵称:用户昵称,如"Dear-迪丽热巴" +- 性别(免cookie版):微博用户性别 +- 生日(免cookie版):用户出生日期 +- 所在地(免cookie版):用户所在地 +- 大学(免cookie版):用户上学时学校的名字 +- 公司(免cookie版):用户所属公司名字 +- 阳光信用(免cookie版):用户的阳光信用 +- 微博注册时间(免cookie版):用户微博注册日期 - 微博数:用户的全部微博数(转发微博+原创微博) - 关注数:用户关注的微博数量 - 粉丝数:用户的粉丝数 -- 性别(免cookie版):微博用户性别 - 简介(免cookie版):用户简介 - 主页地址(免cookie版):微博移动版主页url,如 - 头像url(免cookie版):用户头像url From 884c04194e41e882efca19d80ce485b43ae041a9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Cdataabc?= Date: Mon, 9 Mar 2020 12:38:25 +0800 Subject: [PATCH 141/363] =?UTF-8?q?feat:=20=E6=B7=BB=E5=8A=A0=E8=8E=B7?= =?UTF-8?q?=E5=8F=96=E7=94=A8=E6=88=B7=E6=80=A7=E5=88=AB=E3=80=81=E7=94=9F?= =?UTF-8?q?=E6=97=A5=E3=80=81=E6=89=80=E5=9C=A8=E5=9C=B0=E3=80=81=E6=95=99?= =?UTF-8?q?=E8=82=B2=E7=BB=8F=E5=8E=86=E3=80=81=E5=85=AC=E5=8F=B8=E3=80=81?= =?UTF-8?q?=E8=BE=BE=E4=BA=BA=E7=AD=89=E4=BF=A1=E6=81=AF=E7=9A=84=E5=8A=9F?= =?UTF-8?q?=E8=83=BD?= MIME-Version: 1.0 Content-Type: 
text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- weiboSpider.py | 50 ++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 42 insertions(+), 8 deletions(-) diff --git a/weiboSpider.py b/weiboSpider.py index 9a5a892c..5efc5fad 100644 --- a/weiboSpider.py +++ b/weiboSpider.py @@ -134,8 +134,8 @@ def handle_garbled(self, info): print('Error: ', e) traceback.print_exc() - def get_nickname(self): - """获取用户昵称""" + def extract_user_info(self): + """提取用户信息""" try: url = 'https://weibo.cn/%s/info' % (self.user_config['user_id']) selector = self.handle_html(url) @@ -144,7 +144,33 @@ def get_nickname(self): if nickname == u'登录 - 新' or nickname == u'新浪': self.write_log() sys.exit(u'cookie错误或已过期,请按照README中方法重新获取') - return nickname + self.user['nickname'] = nickname + basic_info = selector.xpath("//div[@class='c'][3]/text()") + zh_list = [u'性别', u'地区', u'生日', u'简介', u'认证', u'达人'] + en_list = [ + 'gender', 'location', 'birthday', 'description', + 'verified_reason', 'talent', 'education', 'company' + ] + for i in en_list: + self.user[i] = '' + for i in basic_info: + if i.split(':')[0] in zh_list: + self.user[en_list[zh_list.index( + i.split(':')[0])]] = i.split(':')[1].replace( + '\u3000', '') + if selector.xpath("//div[@class='tip'][2]/text()")[0] == u'学习经历': + self.user['education'] = selector.xpath( + "//div[@class='c'][4]/text()")[0][1:].replace( + u'\xa0', u' ') + if selector.xpath( + "//div[@class='tip'][3]/text()")[0] == u'工作经历': + self.user['work'] = selector.xpath( + "//div[@class='c'][5]/text()")[0][1:].replace( + u'\xa0', u' ') + elif selector.xpath("//div[@class='tip'][2]/text()")[0] == u'工作经历': + self.user['work'] = selector.xpath( + "//div[@class='c'][4]/text()")[0][1:].replace( + u'\xa0', u' ') except Exception as e: print('Error: ', e) traceback.print_exc() @@ -171,8 +197,16 @@ def user_to_mysql(self): # 创建'user'表 create_table = """ CREATE TABLE IF NOT EXISTS user ( - id varchar(12) NOT NULL, + id varchar(20) NOT NULL, nickname 
varchar(30), + gender varchar(10), + location varchar(200), + birthday varchar(40), + description varchar(140), + verified_reason varchar(140), + talent varchar(200), + education varchar(200), + work varchar(200), weibo_num INT, following INT, followers INT, @@ -211,11 +245,11 @@ def get_user_id(self, selector): break return self.user_config['user_id'] - def get_user_info(self, selector): - """获取用户id、昵称、微博数、关注数、粉丝数""" + def get_user(self, selector): + """获取用户信息、微博数、关注数、粉丝数""" try: self.user['id'] = self.get_user_id(selector) - self.user['nickname'] = self.get_nickname() # 获取用户昵称 + self.extract_user_info() # 获取用户信息 user_info = selector.xpath("//div[@class='tip2']/*/text()") weibo_num = int(user_info[0][3:-1]) following = int(user_info[1][3:-1]) @@ -1007,7 +1041,7 @@ def get_weibo_info(self): try: url = 'https://weibo.cn/%s' % (self.user_config['user_uri']) selector = self.handle_html(url) - self.get_user_info(selector) # 获取用户昵称、微博数、关注数、粉丝数 + self.get_user(selector) # 获取用户信息、微博数、关注数、粉丝数 since_date = self.str_to_time(self.user_config['since_date']) now = datetime.now().strftime('%Y-%m-%d %H:%M') now = datetime.strptime(now, '%Y-%m-%d %H:%M') From 56bc36dde5552fece61b0abdf1b8c274b8a4405b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Cdataabc?= Date: Mon, 9 Mar 2020 12:55:51 +0800 Subject: [PATCH 142/363] =?UTF-8?q?fix:=20=E4=BF=AE=E5=A4=8D=E7=94=A8?= =?UTF-8?q?=E6=88=B7=E4=BF=A1=E6=81=AF=E5=86=99=E5=85=A5mysql=E6=95=B0?= =?UTF-8?q?=E6=8D=AE=E5=BA=93=E5=87=BA=E9=94=99=E7=9A=84=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- weiboSpider.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/weiboSpider.py b/weiboSpider.py index 5efc5fad..54bc08ac 100644 --- a/weiboSpider.py +++ b/weiboSpider.py @@ -149,7 +149,7 @@ def extract_user_info(self): zh_list = [u'性别', u'地区', u'生日', u'简介', u'认证', u'达人'] en_list = [ 'gender', 'location', 'birthday', 'description', - 'verified_reason', 'talent', 
'education', 'company' + 'verified_reason', 'talent', 'education', 'work' ] for i in en_list: self.user[i] = '' From 86f4d96304ed05297d683085c7261d5e45050d51 Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Wed, 11 Mar 2020 13:46:10 +0800 Subject: [PATCH 143/363] Update README.md --- README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index e8910347..86267f9a 100644 --- a/README.md +++ b/README.md @@ -38,11 +38,11 @@ **用户信息** - 用户id:微博用户id,如"1669879400",其实这个字段本来就是已知字段 - 昵称:用户昵称,如"Dear-迪丽热巴" -- 性别(免cookie版):微博用户性别 -- 生日(免cookie版):用户出生日期 -- 所在地(免cookie版):用户所在地 -- 大学(免cookie版):用户上学时学校的名字 -- 公司(免cookie版):用户所属公司名字 +- 性别:微博用户性别 +- 生日:用户出生日期 +- 所在地:用户所在地 +- 学习经历:用户上学时学校的名字和时间 +- 工作经历:用户所属公司名字和时间 - 阳光信用(免cookie版):用户的阳光信用 - 微博注册时间(免cookie版):用户微博注册日期 - 微博数:用户的全部微博数(转发微博+原创微博) From c08e3a8b9164983b8bef9eb77d07b3dde799c0a1 Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Fri, 13 Mar 2020 19:09:50 +0800 Subject: [PATCH 144/363] Update README.md --- README.md | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 86267f9a..b02e061c 100644 --- a/README.md +++ b/README.md @@ -112,11 +112,19 @@ json文件包含迪丽热巴的用户信息和上千条微博信息,内容较 ``` { "user": { + "id": "1669879400", "nickname": "Dear-迪丽热巴", - "weibo_num": 1086, - "following": 248, - "followers": 65594012, - "id": "1669879400" + "gender": "女", + "location": "上海", + "birthday": "双子座", + "description": "一只喜欢默默表演的小透明。工作联系jaywalk@jaywalk.com.cn 🍒", + "verified_reason": "嘉行传媒签约演员", + "talent": "", + "education": "上海戏剧学院", + "work": "嘉行传媒 ", + "weibo_num": 1121, + "following": 250, + "followers": 66395910 }, "weibo": [ { From ac75107a357df5238a2cb1cb48fca2654ac4b309 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Cdataabc?= Date: Tue, 17 Mar 2020 12:28:05 +0800 Subject: [PATCH 145/363] =?UTF-8?q?perf:=20=E4=BC=98=E5=8C=96user=5Fid=5Fl?= =?UTF-8?q?ist.txt=E6=96=87=E4=BB=B6=E8=AF=BB=E5=8F=96?= MIME-Version: 1.0 Content-Type: text/plain; 
charset=UTF-8 Content-Transfer-Encoding: 8bit 添加user_id去重 --- weiboSpider.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/weiboSpider.py b/weiboSpider.py index 54bc08ac..7911d842 100644 --- a/weiboSpider.py +++ b/weiboSpider.py @@ -63,8 +63,8 @@ def validate_config(self, config): """验证配置是否正确""" # 验证filter、pic_download、video_download - argument_lsit = ['filter', 'pic_download', 'video_download'] - for argument in argument_lsit: + argument_list = ['filter', 'pic_download', 'video_download'] + for argument in argument_list: if config[argument] != 0 and config[argument] != 1: sys.exit(u'%s值应为0或1,请重新输入' % config[argument]) @@ -1099,7 +1099,8 @@ def get_user_config_list(self, file_name): user_config['since_date'] = info[2] else: user_config['since_date'] = self.since_date - user_config_list.append(user_config) + if user_config not in user_config_list: + user_config_list.append(user_config) return user_config_list def initialize_info(self, user_config): From f70f6894d9aef349255b58a716701be8c033aca5 Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Wed, 18 Mar 2020 12:56:36 +0800 Subject: [PATCH 146/363] Update README.md --- README.md | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index b02e061c..797af1df 100644 --- a/README.md +++ b/README.md @@ -13,6 +13,7 @@ * [定期自动爬取微博(可选)](#7定期自动爬取微博可选) * [如何获取cookie](#如何获取cookie) * [如何获取user_id](#如何获取user_id) +* [如何获取大量user_id](#如何获取大量user_id) * [注意事项](#注意事项) ## 功能 @@ -33,6 +34,7 @@ 当然,如果你只对用户信息感兴趣,而不需要爬用户的微博,也可以通过设置实现只爬取微博用户信息的功能。
程序也可以实现**爬取结果自动更新**,即:现在爬取了目标用户的微博,几天之后,目标用户可能又发新微博了。通过设置,可以实现每隔几天**增量爬取**用户这几天发的新微博。具体方法见[定期自动爬取微博](#7定期自动爬取微博可选)。
本程序需要设置用户cookie,以获取微博访问权限,后面会讲解如何获取cookie。如需[免cookie版](https://github.com/dataabc/weibo-crawler),大家可以访问,二者功能类似,免cookie版获取的信息更多,用法更简单,而且不需要cookie。
+如果想要获得**大量**微博,见[如何获取大量user_id](#如何获取大量user_id)部分。
## 输出 本部分为爬取到的字段信息说明,为了与[免cookie版](https://github.com/dataabc/weibo-crawler)区分,下面将两者爬取到的信息都列出来。如果是免cookie版所特有的信息,会有免cookie标注,没有标注的为二者共有的信息。
**用户信息** @@ -403,7 +405,11 @@ txt文件名格式可以参考[程序设置](#3程序设置)中的**设置user_i 2.按照上图箭头所指,点击"资料"链接,跳转到用户资料页面;
![](https://picture.cognize.me/cognize/github/weibospider/user_info.png) 如上图所示,迪丽热巴微博资料页的地址为"",其中的"1669879400"即为此微博的user_id。
-事实上,此微博的user_id也包含在用户主页()中,之所以我们还要点击主页中的"资料"来获取user_id,是因为很多用户的主页不是""的形式,而是""或""的形式。其中"微号"和user_id都是一串数字,如果仅仅通过主页地址提取user_id,很容易将"微号"误认为user_id。 +事实上,此微博的user_id也包含在用户主页()中,之所以我们还要点击主页中的"资料"来获取user_id,是因为很多用户的主页不是""的形式,而是""或""的形式。其中"微号"和user_id都是一串数字,如果仅仅通过主页地址提取user_id,很容易将"微号"误认为user_id。
+上述可以获得一个user_id,如果想要获得**大量**微博,见[如何获取大量user_id](#如何获取大量user_id)部分。
+ +## 如何获取大量user_id +[如何获取user_id](#如何获取user_id)部分可以获得一个user_id,可以利用这一个user_id,获取该user_id微博用户关注人的user_id,一个user_id最多可以获得200个user_id,并写入user_id_list.txt文件。程序支持读文件,利用这200个user_id,可以获得最多200X200=40000个user_id。再利用这40000个user_id可以得到40000X200=8000000个user_id,如此反复,以此类推,可以获得大量user_id。本项目也支持读文件,将上述程序的结果文件user_id_list.txt路径赋值给本项目config.json的user_id_list参数,就可以获得这些user_id用户所发布的大量微博。
## 注意事项 1.user_id不能为爬虫微博的user_id。因为要爬微博信息,必须先登录到某个微博账号,此账号我们姑且称为爬虫微博。爬虫微博访问自己的页面和访问其他用户的页面,得到的网页格式不同,所以无法爬取自己的微博信息;如果想要爬取爬虫微博内容,可以参考[获取自身微博信息](https://github.com/dataabc/weiboSpider/issues/113)。
From 39698b12f02fd3b4e52f4e34186577677753fc61 Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Wed, 8 Apr 2020 13:24:44 +0800 Subject: [PATCH 147/363] Update README.md --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 797af1df..e02f13c9 100644 --- a/README.md +++ b/README.md @@ -35,6 +35,8 @@ 程序也可以实现**爬取结果自动更新**,即:现在爬取了目标用户的微博,几天之后,目标用户可能又发新微博了。通过设置,可以实现每隔几天**增量爬取**用户这几天发的新微博。具体方法见[定期自动爬取微博](#7定期自动爬取微博可选)。
本程序需要设置用户cookie,以获取微博访问权限,后面会讲解如何获取cookie。如需[免cookie版](https://github.com/dataabc/weibo-crawler),大家可以访问,二者功能类似,免cookie版获取的信息更多,用法更简单,而且不需要cookie。
如果想要获得**大量**微博,见[如何获取大量user_id](#如何获取大量user_id)部分。
+
+另外,推荐下另一个程序[weibo-search](https://github.com/dataabc/weibo-search)。该程序可以连续获取一个或多个**微博关键词搜索**结果,并将结果写入文件(可选)、数据库(可选)等。所谓微博关键词搜索即:**搜索正文中包含指定关键词的微博**,可以指定搜索的时间范围。对于非常热门的关键词,一天的时间范围,可以获得**1000万**以上的搜索结果,N天的时间范围就可以获得1000万 X N搜索结果。对于大多数关键词,一天产生的相应微博数量应该在1000万条以下,因此可以说该程序可以获得大部分关键词的全部或近似全部的搜索结果。而且该程序可以获得搜索结果的所有信息,本程序获得的微博信息该程序都能获得。
## 输出 本部分为爬取到的字段信息说明,为了与[免cookie版](https://github.com/dataabc/weibo-crawler)区分,下面将两者爬取到的信息都列出来。如果是免cookie版所特有的信息,会有免cookie标注,没有标注的为二者共有的信息。
**用户信息** From 3b8727eaf6d1f65ad8c224b5ffd025812beb1546 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Cdataabc?= Date: Wed, 8 Apr 2020 18:56:12 +0800 Subject: [PATCH 148/363] =?UTF-8?q?feat:=20=E8=BF=87=E6=BB=A4=E5=BE=AE?= =?UTF-8?q?=E5=8D=9A=E6=97=B6=E6=B7=BB=E5=8A=A0=E8=BF=87=E6=BB=A4=E6=8F=90?= =?UTF-8?q?=E7=A4=BA?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 如果没有过滤提示,在获取原创微博时,遇到连续的转发微博,进度条不变,会给人程序卡死的错觉。添加过滤提示,可以告诉使用者程序正在正常运行,正在过滤转发微博 Issue #140 --- weiboSpider.py | 1 + 1 file changed, 1 insertion(+) diff --git a/weiboSpider.py b/weiboSpider.py index 7911d842..80fe24fc 100644 --- a/weiboSpider.py +++ b/weiboSpider.py @@ -650,6 +650,7 @@ def get_one_weibo(self, info): weibo['comment_num'] = footer['comment_num'] # 评论数 else: weibo = None + print(u'正在过滤转发微博') return weibo except Exception as e: print('Error: ', e) From 474be7bbcedfb951a85aea98760804c67941650a Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Wed, 22 Apr 2020 12:53:12 +0800 Subject: [PATCH 149/363] Update README.md --- README.md | 36 +++++++++++++++++++++++++++++------- 1 file changed, 29 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index e02f13c9..33f824c2 100644 --- a/README.md +++ b/README.md @@ -52,7 +52,7 @@ - 微博数:用户的全部微博数(转发微博+原创微博) - 关注数:用户关注的微博数量 - 粉丝数:用户的粉丝数 -- 简介(免cookie版):用户简介 +- 简介:用户简介 - 主页地址(免cookie版):微博移动版主页url,如 - 头像url(免cookie版):用户头像url - 高清头像url(免cookie版):用户高清头像url @@ -60,7 +60,7 @@ - 会员等级(免cookie版):微博会员用户等级,普通用户该等级为0 - 是否认证(免cookie版):用户是否认证,为布尔类型 - 认证类型(免cookie版):用户认证类型,如个人认证、企业认证、政府认证等 -- 认证信息(免cookie版):为认证用户特有,用户信息栏显示的认证信息 +- 认证信息:为认证用户特有,用户信息栏显示的认证信息 *** **微博信息** - 微博id:微博唯一标志 @@ -286,6 +286,14 @@ MySQL和MongDB数据库的写入内容一样。程序首先会创建一个名为 **user表**
**id**:存储用户id,如"1669879400";
**nickname**:存储用户昵称,如"Dear-迪丽热巴";
+**gender**:存储用户性别;
+**location**:存储用户所在地;
+**birthday**:存储用户出生日期;
+**description**:存储用户简介;
+**verified_reason**:存储用户认证;
+**talent**:存储用户标签;
+**education**:存储用户学习经历;
+**work**:存储用户工作经历;
**weibo_num**:存储微博数;
**following**:存储关注数;
**followers**:存储粉丝数。
@@ -326,11 +334,25 @@ $ python weibospider.py wb.start() # 爬取微博信息 ``` 用户可以按照自己的需求调用或修改Weibo类。
-通过执行本程序,我们可以得到很多信息:
-**wb.nickname**:用户昵称;
-**wb.weibo_num**:微博数;
-**wb.following**:关注数;
-**wb.followers**:粉丝数;
+通过执行本程序,我们可以得到很多信息
+
+ +点击查看详情 + +**wb.user['nickname']**:用户昵称;
+**wb.user['gender']**:用户性别;
+**wb.user['location']**:用户所在地;
+**wb.user['birthday']**:用户出生日期;
+**wb.user['description']**:用户简介;
+**wb.user['verified_reason']**:用户认证;
+**wb.user['talent']**:用户标签;
+**wb.user['education']**:用户学习经历;
+**wb.user['work']**:用户工作经历;
+**wb.user['weibo_num']**:微博数;
+**wb.user['following']**:关注数;
+**wb.user['followers']**:粉丝数;
+
+ **wb.weibo**:除不包含上述信息外,wb.weibo包含爬取到的所有微博信息,如**微博id**、**微博正文**、**原始图片url**、**发布位置**、**发布时间**、**发布工具**、**点赞数**、**转发数**、**评论数**等。如果爬的是全部微博(原创+转发),除上述信息之外,还包含被**转发微博原始图片url**、**是否为原创微博**等。wb.weibo是一个列表,包含了爬取的所有微博信息。wb.weibo[0]为爬取的第一条微博,wb.weibo[1]为爬取的第二条微博,以此类推。当filter=1时,wb.weibo[0]为爬取的第一条**原创**微博,以此类推。wb.weibo[0]['id']为第一条微博的id,wb.weibo[0]['content']为第一条微博的正文,wb.weibo[0]['publish_time']为第一条微博的发布时间,还有其它很多信息不在赘述,大家可以点击下面的"详情"查看具体用法。
From 0abd4748ddf46d7972ad16d2020d73a7f99368fb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Cdataabc?= Date: Fri, 24 Apr 2020 20:04:02 +0800 Subject: [PATCH 150/363] =?UTF-8?q?fix:=20=E4=BF=AE=E5=A4=8D=E5=BD=93?= =?UTF-8?q?=E5=9B=BE=E7=89=87url=E4=B8=8D=E4=BB=A5=E5=9B=BE=E7=89=87?= =?UTF-8?q?=E7=B1=BB=E5=9E=8B=E7=BB=93=E5=B0=BE=E6=97=B6=E5=87=BA=E9=94=99?= =?UTF-8?q?=E7=9A=84bug?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- weiboSpider.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/weiboSpider.py b/weiboSpider.py index 80fe24fc..4c936656 100644 --- a/weiboSpider.py +++ b/weiboSpider.py @@ -588,12 +588,20 @@ def handle_download(self, file_type, file_dir, urls, w): if ',' in urls: url_list = urls.split(',') for i, url in enumerate(url_list): - file_suffix = url[url.rfind('.'):] + index = url.rfind('.') + if len(url) - index >= 5: + file_suffix = '.jpg' + else: + file_suffix = url[index:] file_name = file_prefix + '_' + str(i + 1) + file_suffix file_path = file_dir + os.sep + file_name self.download_one_file(url, file_path, file_type, w['id']) else: - file_suffix = urls[urls.rfind('.'):] + index = urls.rfind('.') + if len(urls) - index > 5: + file_suffix = '.jpg' + else: + file_suffix = urls[index:] file_name = file_prefix + file_suffix file_path = file_dir + os.sep + file_name self.download_one_file(urls, file_path, file_type, w['id']) From 81f1f61e263251dc664a674c818b79af6e216686 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Cdataabc?= Date: Thu, 30 Apr 2020 21:28:11 +0800 Subject: [PATCH 151/363] =?UTF-8?q?feat:=20=E6=B7=BB=E5=8A=A0=E8=8E=B7?= =?UTF-8?q?=E5=8F=96=E5=BE=AE=E5=8D=9A=E4=B8=AD=E5=A4=B4=E6=9D=A1=E6=96=87?= =?UTF-8?q?=E7=AB=A0url=E7=9A=84=E5=8A=9F=E8=83=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- weiboSpider.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/weiboSpider.py 
b/weiboSpider.py index 4c936656..414dfc1f 100644 --- a/weiboSpider.py +++ b/weiboSpider.py @@ -373,6 +373,16 @@ def get_weibo_content(self, info, is_original): print('Error: ', e) traceback.print_exc() + def get_article_url(self, info): + """获取微博头条文章的url""" + article_url = '' + text = self.handle_garbled(info) + if text.startswith(u'发布了头条文章'): + url = info.xpath('.//a/@href') + if url and url[0].startswith('https://weibo.cn/sinaurl'): + article_url = url[0] + return article_url + def get_publish_place(self, info): """获取微博发布位置""" try: @@ -640,6 +650,7 @@ def get_one_weibo(self, info): weibo['id'] = info.xpath('@id')[0][2:] weibo['content'] = self.get_weibo_content(info, is_original) # 微博内容 + weibo['article_url'] = self.get_article_url(info) # 头条文章url picture_urls = self.get_picture_urls(info, is_original) weibo['original_pictures'] = picture_urls[ 'original_pictures'] # 原创图片url @@ -757,6 +768,7 @@ def write_csv(self, wrote_num): result_headers = [ '微博id', '微博正文', + '头条文章url', '原始图片url', '微博视频url', '发布位置', @@ -767,8 +779,8 @@ def write_csv(self, wrote_num): '评论数', ] if not self.filter: - result_headers.insert(3, '被转发微博原始图片url') - result_headers.insert(4, '是否为原创微博') + result_headers.insert(4, '被转发微博原始图片url') + result_headers.insert(5, '是否为原创微博') result_data = [w.values() for w in self.weibo[wrote_num:]] if sys.version < '3': # python2.x reload(sys) @@ -978,6 +990,7 @@ def weibo_to_mysql(self, wrote_num): id varchar(10) NOT NULL, user_id varchar(12), content varchar(2000), + article_url varchar(200), original_pictures varchar(3000), retweet_pictures varchar(3000), original BOOLEAN NOT NULL DEFAULT 1, From 304e082b72a92aa389c64d41e186b49dd003d662 Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Thu, 30 Apr 2020 21:58:38 +0800 Subject: [PATCH 152/363] Update README.md --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index 33f824c2..6f3f2fa8 100644 --- a/README.md +++ b/README.md @@ -65,6 +65,7 @@ **微博信息** - 微博id:微博唯一标志 - 微博内容:微博正文 +- 
头条文章url:微博中头条文章的url,若微博中不存在头条文章,则值为'' - 原始图片url:原创微博图片和转发微博转发理由中图片的url,若某条微博存在多张图片,每个url以英文逗号分隔,若没有图片则值为"无" - 视频url: 微博中的视频url,若微博中没有视频,则值为"无" - 微博发布位置:位置微博中的发布位置 @@ -302,6 +303,7 @@ MySQL和MongDB数据库的写入内容一样。程序首先会创建一个名为 **id**:存储微博id;
**user_id**:存储微博发布者的用户id,如"1669879400";
**content**:存储微博正文;
+**article_url**:存储微博中头条文章的url,若微博中不存在头条文章,则值为'';
**original_pictures**:存储原创微博的原始图片url和转发微博转发理由中的图片url。若某条微博有多张图片,则存储多个url,以英文逗号分割;若某微博没有图片,则值为"无";
**retweet_pictures**:存储被转发微博中的原始图片url。当最新微博为原创微博或者为没有图片的转发微博时,则值为"无",否则为被转发微博的图片url。若有多张图片,则存储多个url,以英文逗号分割;
**publish_place**:存储微博的发布位置。如果某条微博没有位置信息,则值为"无";
@@ -361,6 +363,7 @@ $ python weibospider.py 若目标微博用户存在微博,则:
**id**:存储微博id。如wb.weibo[0]['id']为最新一条微博的id;
**content**:存储微博正文。如wb.weibo[0]['content']为最新一条微博的正文;
+**article_url**:存储微博中头条文章的url。如wb.weibo[0]['article_url']为最新一条微博的头条文章url,若微博中不存在头条文章,则值为'';
**original_pictures**:存储原创微博的原始图片url和转发微博转发理由中的图片url。如wb.weibo[0]['original_pictures']为最新一条微博的原始图片url,若该条微博有多张图片,则存储多个url,以英文逗号分割;若该微博没有图片,则值为"无";
**retweet_pictures**:存储被转发微博中的原始图片url。当最新微博为原创微博或者为没有图片的转发微博时,则值为"无",否则为被转发微博的图片url。若有多张图片,则存储多个url,以英文逗号分割;
**publish_place**:存储微博的发布位置。如wb.weibo[0]['publish_place']为最新一条微博的发布位置,如果该条微博没有位置信息,则值为"无";
From 18806b29a01d75cfe7c89fd1484f11ca56e5cf4a Mon Sep 17 00:00:00 2001 From: Zhengyang Song Date: Tue, 26 May 2020 23:59:21 +0800 Subject: [PATCH 153/363] fix data loss in extract_user_info for description --- weiboSpider.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/weiboSpider.py b/weiboSpider.py index 414dfc1f..7591cd58 100644 --- a/weiboSpider.py +++ b/weiboSpider.py @@ -154,9 +154,9 @@ def extract_user_info(self): for i in en_list: self.user[i] = '' for i in basic_info: - if i.split(':')[0] in zh_list: + if i.split(':', 1)[0] in zh_list: self.user[en_list[zh_list.index( - i.split(':')[0])]] = i.split(':')[1].replace( + i.split(':', 1)[0])]] = i.split(':', 1)[1].replace( '\u3000', '') if selector.xpath("//div[@class='tip'][2]/text()")[0] == u'学习经历': self.user['education'] = selector.xpath( From 664019b38e27220e8e9c36edbccface53698e47a Mon Sep 17 00:00:00 2001 From: Zhengyang Song Date: Fri, 29 May 2020 01:30:09 +0800 Subject: [PATCH 154/363] package up --- .gitignore | 6 +++++ LICENSE | 9 +++++++ setup.py | 27 +++++++++++++++++++ weibo_spider/__init__.py | 0 weibo_spider/__main__.py | 3 +++ weiboSpider.py => weibo_spider/weiboSpider.py | 0 6 files changed, 45 insertions(+) create mode 100644 .gitignore create mode 100644 LICENSE create mode 100644 setup.py create mode 100644 weibo_spider/__init__.py create mode 100644 weibo_spider/__main__.py rename weiboSpider.py => weibo_spider/weiboSpider.py (100%) diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..248a3f15 --- /dev/null +++ b/.gitignore @@ -0,0 +1,6 @@ +*.pyc +__pycache__ + +build +dist +*.egg-info diff --git a/LICENSE b/LICENSE new file mode 100644 index 00000000..ced342bb --- /dev/null +++ b/LICENSE @@ -0,0 +1,9 @@ +Copyright 2020 dataabc + +Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: + +1. 
Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. +3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
diff --git a/setup.py b/setup.py new file mode 100644 index 00000000..f98dcf69 --- /dev/null +++ b/setup.py @@ -0,0 +1,27 @@ +import setuptools + +with open("README.md", "r") as fh: + long_description = fh.read() + +setuptools.setup( + name="weibo-spider", # Replace with your own username + version="0.0.2", + author="Chen Lei", + author_email="chillychen1991@gmail.com", + description="新浪微博爬虫,用python爬取新浪微博数据。", + long_description=long_description, + long_description_content_type="text/markdown", + url="https://github.com/dataabc/weiboSpider", + packages=setuptools.find_packages(), + classifiers=[ + "Programming Language :: Python :: 3", + "License :: OSI Approved :: BSD License", + "Operating System :: OS Independent", + ], + install_requires=[ + 'lxml', + 'requests', + 'tqdm', + ], + python_requires='>=3.6', +) diff --git a/weibo_spider/__init__.py b/weibo_spider/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/weibo_spider/__main__.py b/weibo_spider/__main__.py new file mode 100644 index 00000000..cadb88c0 --- /dev/null +++ b/weibo_spider/__main__.py @@ -0,0 +1,3 @@ +from .weiboSpider import main + +main() diff --git a/weiboSpider.py b/weibo_spider/weiboSpider.py similarity index 100% rename from weiboSpider.py rename to weibo_spider/weiboSpider.py From 3e5f7d7590cab22e265f9eb3a36fc7faec6a19d2 Mon Sep 17 00:00:00 2001 From: Zhengyang Song Date: Sat, 30 May 2020 01:07:05 +0800 Subject: [PATCH 155/363] =?UTF-8?q?1.=20=E5=88=A0=E9=99=A4=20LICENSE=202.?= =?UTF-8?q?=20=E4=BD=BF=E7=94=A8=20absl=20=E6=9D=A5=E9=85=8D=E7=BD=AE=20FL?= =?UTF-8?q?AGS.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 3 +++ LICENSE | 9 --------- requirements.txt | 1 + setup.py | 8 ++++---- weibo_spider/__main__.py | 4 +++- config.json => weibo_spider/config.json | 0 weibo_spider/weiboSpider.py | 18 ++++++++++++++---- 7 files changed, 25 insertions(+), 18 deletions(-) delete mode 100644 LICENSE rename config.json 
=> weibo_spider/config.json (100%) diff --git a/.gitignore b/.gitignore index 248a3f15..c25fb50b 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,6 @@ +.vscode + +weibo/ *.pyc __pycache__ diff --git a/LICENSE b/LICENSE deleted file mode 100644 index ced342bb..00000000 --- a/LICENSE +++ /dev/null @@ -1,9 +0,0 @@ -Copyright 2020 dataabc - -Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - -1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. -3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
diff --git a/requirements.txt b/requirements.txt index d0747725..806d086b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ lxml==4.3.4 requests==2.22.0 tqdm==4.32.2 +absl-py==0.9.0 \ No newline at end of file diff --git a/setup.py b/setup.py index f98dcf69..25e34a9a 100644 --- a/setup.py +++ b/setup.py @@ -4,8 +4,8 @@ long_description = fh.read() setuptools.setup( - name="weibo-spider", # Replace with your own username - version="0.0.2", + name="weibo-spider", + version="0.0.4", author="Chen Lei", author_email="chillychen1991@gmail.com", description="新浪微博爬虫,用python爬取新浪微博数据。", @@ -15,13 +15,13 @@ packages=setuptools.find_packages(), classifiers=[ "Programming Language :: Python :: 3", - "License :: OSI Approved :: BSD License", "Operating System :: OS Independent", ], install_requires=[ 'lxml', 'requests', - 'tqdm', + 'tqdm', + 'absl-py', ], python_requires='>=3.6', ) diff --git a/weibo_spider/__main__.py b/weibo_spider/__main__.py index cadb88c0..f562cfae 100644 --- a/weibo_spider/__main__.py +++ b/weibo_spider/__main__.py @@ -1,3 +1,5 @@ from .weiboSpider import main -main() +from absl import app + +app.run(main) diff --git a/config.json b/weibo_spider/config.json similarity index 100% rename from config.json rename to weibo_spider/config.json diff --git a/weibo_spider/weiboSpider.py b/weibo_spider/weiboSpider.py index 7591cd58..8ab19fc5 100644 --- a/weibo_spider/weiboSpider.py +++ b/weibo_spider/weiboSpider.py @@ -18,6 +18,11 @@ from lxml import etree from requests.adapters import HTTPAdapter from tqdm import tqdm +from absl import app, flags + +FLAGS = flags.FLAGS + +flags.DEFINE_string('config_path', None, 'The path to config.json.') class Weibo(object): @@ -1149,10 +1154,15 @@ def start(self): traceback.print_exc() -def main(): +def main(argv): + del argv # useless + try: - config_path = os.path.split( - os.path.realpath(__file__))[0] + os.sep + 'config.json' + if FLAGS.config_path is not None: + config_path = FLAGS.config_path + else: + 
config_path = os.path.split( + os.path.realpath(__file__))[0] + os.sep + 'config.json' if not os.path.isfile(config_path): sys.exit(u'当前路径:%s 不存在配置文件config.json' % (os.path.split(os.path.realpath(__file__))[0] + os.sep)) @@ -1170,4 +1180,4 @@ def main(): if __name__ == '__main__': - main() + app.run(main) From dfce2498301086e4d37df99550ccfbe7c03a0fbb Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Sun, 31 May 2020 02:28:56 +0800 Subject: [PATCH 156/363] Update README.md --- README.md | 47 +++++++++++++++++++++++++---------------------- 1 file changed, 25 insertions(+), 22 deletions(-) diff --git a/README.md b/README.md index 6f3f2fa8..75d43711 100644 --- a/README.md +++ b/README.md @@ -3,14 +3,6 @@ * [实例](#实例) * [运行环境](#运行环境) * [使用说明](#使用说明) - * [版本](#0版本) - * [下载脚本](#1下载脚本) - * [安装依赖](#2安装依赖) - * [程序设置](#3程序设置) - * [设置数据库(可选)](#4设置数据库可选) - * [运行脚本](#5运行脚本) - * [按需求修改脚本(可选)](#6按需求修改脚本可选) - * [定期自动爬取微博(可选)](#7定期自动爬取微博可选) * [如何获取cookie](#如何获取cookie) * [如何获取user_id](#如何获取user_id) * [如何获取大量user_id](#如何获取大量user_id) @@ -100,9 +92,9 @@ 对于上述参数的含义以及取值范围,这里仅作简单介绍,详细信息见[程序设置](#3程序设置)。 >**user_id_list**代表我们要爬取的微博用户的user_id,可以是一个或多个,也可以是文件路径,微博用户Dear-迪丽热巴的user_id为1669879400,具体如何获取user_id见[如何获取user_id](#如何获取user_id);
**filter**的值为1代表爬取全部原创微博,值为0代表爬取全部微博(原创+转发);
**since_date**代表我们要爬取since_date日期之后发布的微博,因为我要爬迪丽热巴的全部原创微博,所以since_date设置了一个非常早的值;
**write_mode**代表结果文件的保存类型,我想要把结果写入txt文件、csv文件和json文件,所以它的值为["csv", "txt", "json"],如果你想写入数据库,具体设置见[设置数据库](#4设置数据库可选);
**pic_download**值为1代表下载微博中的图片,值为0代表不下载;
**video_download**值为1代表下载微博中的视频,值为0代表不下载;
**cookie**是爬虫微博的cookie,具体如何获取cookie见[如何获取cookie](#如何获取cookie),获取cookie后把"your cookie"替换成真实的cookie值即可。
-cookie修改完成后运行程序: +cookie修改完成后运行weiboSpider.py,该文件位于weibospider=>weibo_spider: ```bash -$ python weibospider.py +$ python weiboSpider.py ``` 程序会自动生成一个weibo文件夹,我们以后爬取的所有微博都被存储在这里。然后程序在该文件夹下生成一个名为"Dear-迪丽热巴"的文件夹,迪丽热巴的所有微博爬取结果都在这里。"Dear-迪丽热巴"文件夹里包含一个csv文件、一个txt文件、一个json文件、一个img文件夹和一个video文件夹,img文件夹用来存储下载到的图片,video文件夹用来存储下载到的视频。如果你设置了保存数据库功能,这些信息也会保存在数据库里,数据库设置见[设置数据库](#4设置数据库可选)部分。

@@ -182,16 +174,23 @@ json文件包含迪丽热巴的用户信息和上千条微博信息,内容较 多文件版由[songzy12](https://github.com/songzy12)重构。songzy12非常认真负责,对于我发现的问题都很耐心地修复了,而且效率非常高,在此感谢。
本使用说明是单文件版的使用说明。 ### 1.下载脚本 +本程序提供两种下载方式,一种是**源码下载安装**,另一种是**pip安装**,二者功能完全相同。如果你需要修改源码,建议使用第一种方式,否则选哪种安装方式都可以。
+**源码下载安装**
+下载脚本 ```bash $ git clone https://github.com/dataabc/weibospider.git ``` -运行上述命令,将本项目下载到当前目录,如果下载成功当前目录会出现一个名为"weibospider"的文件夹; -### 2.安装依赖 +安装依赖 ```bash $ pip install -r requirements.txt ``` -### 3.程序设置 -打开**config.json**文件,你会看到如下内容: +运行上述命令,将本项目下载到当前目录,如果下载成功当前目录会出现一个名为"weibospider"的文件夹;
+**pip安装** +```bash +$ python -m pip install --index-url https://pypi.org/simple weibo-spider +``` +### 2.程序设置 +如果你使用的是**源码下载安装**,请打开**config.json**文件,你会看到如下内容: ``` { "user_id_list": ["1669879400"], @@ -210,6 +209,7 @@ $ pip install -r requirements.txt } } ``` +如果你使用的是**pip安装**,你需要在任意目录下,创建上面的config.json文件。
下面讲解每个参数的含义与设置方法。
**设置user_id_list**
user_id_list是我们要爬取的微博的id,可以是一个,也可以是多个,例如: @@ -268,7 +268,7 @@ video_download控制是否下载微博中的视频,值为1代表下载,值 **设置mysql_config(可选)**
mysql_config控制mysql参数配置。如果你不需要将结果信息写入mysql,这个参数可以忽略,即删除或保留都无所谓;如果你需要写入mysql且config.json文件中mysql_config的配置与你的mysql配置不一样,请将该值改成你自己mysql中的参数配置。 -### 4.设置数据库(可选) +### 3.设置数据库(可选) 本部分是可选部分,如果不需要将爬取信息写入数据库,可跳过这一步。本程序目前支持MySQL数据库和MongoDB数据库,如果你需要写入其它数据库,可以参考这两个数据库的写法自己编写。
**MySQL数据库写入**
要想将爬取信息写入MySQL,请根据自己的系统环境安装MySQL,然后命令行执行: @@ -315,13 +315,16 @@ MySQL和MongDB数据库的写入内容一样。程序首先会创建一个名为
-### 5.运行脚本 -大家可以根据自己的运行环境选择运行方式,Linux可以通过 +### 4.运行脚本 +**源码下载安装**的用户可以在weiboSpider.py文件所在目录下运行 +```bash +$ python weiboSpider.py +``` +**pip安装**的用户可以在config.json文件所在目录运行 ```bash -$ python weibospider.py +$ python -m weibo_spider --config_path="config.json" ``` -运行; -### 6.按需求修改脚本(可选) +### 5.按需求修改脚本(可选) 本部分为可选部分,如果你不需要自己修改代码或添加新功能,可以忽略此部分。
本程序所有代码都位于weiboSpider.py文件,程序主体是一个Weibo类,上述所有功能都是通过在main函数调用Weibo类实现的,默认的调用代码如下: ```python @@ -375,7 +378,7 @@ $ python weibospider.py -### 7.定期自动爬取微博(可选) +### 6.定期自动爬取微博(可选) 我们爬取了微博以后,很多微博账号又可能发了一些新微博,定期自动爬取微博就是每隔一段时间自动运行程序,自动爬取这段时间产生的新微博(忽略以前爬过的旧微博)。本部分为可选部分,如果不需要可以忽略。
思路是**利用第三方软件,如crontab,让程序每隔一段时间运行一次**。因为是要跳过以前爬过的旧微博,只爬新微博。所以需要**设置一个动态的since_date**。很多时候我们使用的since_date是固定的,比如since_date="2018-01-01",程序就会按照这个设置从最新的微博一直爬到发布时间为2018-01-01的微博(包括这个时间)。因为我们想追加新微博,跳过旧微博。第二次爬取时since_date值就应该是当前时间到上次爬取的时间。 如果我们使用最原始的方式实现追加爬取,应该是这样: @@ -384,7 +387,7 @@ $ python weibospider.py 第二次爬取,我们想要接着上次的爬,那since_date的值应该是上次程序执行的日期,即2019-06-06 ``` 上面的方法太麻烦,因为每次都要手动设置since_date。因此我们需要动态设置since_date,即程序根据实际情况,自动生成since_date。
-有两种方法实现动态更新since_date:
+有两种方法实现动态更新since_date,**推荐使用方法二**:
**方法一:将since_date设置成整数** 将config.json文件中的since_date设置成整数,如: ``` From 1bb19a0145cd2b7b293a242642f785106b63227e Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Sun, 31 May 2020 02:34:57 +0800 Subject: [PATCH 157/363] Update README.md --- README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 75d43711..5f6d5147 100644 --- a/README.md +++ b/README.md @@ -3,6 +3,7 @@ * [实例](#实例) * [运行环境](#运行环境) * [使用说明](#使用说明) +* [定期自动爬取微博(可选)](#定期自动爬取微博可选) * [如何获取cookie](#如何获取cookie) * [如何获取user_id](#如何获取user_id) * [如何获取大量user_id](#如何获取大量user_id) @@ -24,7 +25,7 @@ - 下载用户**转发**微博**Live Photo**中的**视频**([免cookie版](https://github.com/dataabc/weibo-crawler)特有)
当然,如果你只对用户信息感兴趣,而不需要爬用户的微博,也可以通过设置实现只爬取微博用户信息的功能。
-程序也可以实现**爬取结果自动更新**,即:现在爬取了目标用户的微博,几天之后,目标用户可能又发新微博了。通过设置,可以实现每隔几天**增量爬取**用户这几天发的新微博。具体方法见[定期自动爬取微博](#7定期自动爬取微博可选)。
+程序也可以实现**爬取结果自动更新**,即:现在爬取了目标用户的微博,几天之后,目标用户可能又发新微博了。通过设置,可以实现每隔几天**增量爬取**用户这几天发的新微博。具体方法见[定期自动爬取微博](#定期自动爬取微博可选)。
本程序需要设置用户cookie,以获取微博访问权限,后面会讲解如何获取cookie。如需[免cookie版](https://github.com/dataabc/weibo-crawler),大家可以访问,二者功能类似,免cookie版获取的信息更多,用法更简单,而且不需要cookie。
如果想要获得**大量**微博,见[如何获取大量user_id](#如何获取大量user_id)部分。

@@ -378,7 +379,7 @@ $ python -m weibo_spider --config_path="config.json" -### 6.定期自动爬取微博(可选) +## 定期自动爬取微博(可选) 我们爬取了微博以后,很多微博账号又可能发了一些新微博,定期自动爬取微博就是每隔一段时间自动运行程序,自动爬取这段时间产生的新微博(忽略以前爬过的旧微博)。本部分为可选部分,如果不需要可以忽略。
思路是**利用第三方软件,如crontab,让程序每隔一段时间运行一次**。因为是要跳过以前爬过的旧微博,只爬新微博。所以需要**设置一个动态的since_date**。很多时候我们使用的since_date是固定的,比如since_date="2018-01-01",程序就会按照这个设置从最新的微博一直爬到发布时间为2018-01-01的微博(包括这个时间)。因为我们想追加新微博,跳过旧微博。第二次爬取时since_date值就应该是当前时间到上次爬取的时间。 如果我们使用最原始的方式实现追加爬取,应该是这样: From 7e7400c011ca567980c3385f2d4483d923778c8a Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Sun, 31 May 2020 13:26:37 +0800 Subject: [PATCH 158/363] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 5f6d5147..7d7dc5c8 100644 --- a/README.md +++ b/README.md @@ -188,7 +188,7 @@ $ pip install -r requirements.txt 运行上述命令,将本项目下载到当前目录,如果下载成功当前目录会出现一个名为"weibospider"的文件夹;
**pip安装** ```bash -$ python -m pip install --index-url https://pypi.org/simple weibo-spider +$ python -m pip install weibo-spider ``` ### 2.程序设置 如果你使用的是**源码下载安装**,请打开**config.json**文件,你会看到如下内容: From 48bbc0c470e35dc37e09b513022e8890cec3d2af Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Sun, 31 May 2020 18:19:26 +0800 Subject: [PATCH 159/363] Update README.md --- README.md | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 7d7dc5c8..bbd7676e 100644 --- a/README.md +++ b/README.md @@ -46,7 +46,7 @@ - 关注数:用户关注的微博数量 - 粉丝数:用户的粉丝数 - 简介:用户简介 -- 主页地址(免cookie版):微博移动版主页url,如 +- 主页地址(免cookie版):微博移动版主页url - 头像url(免cookie版):用户头像url - 高清头像url(免cookie版):用户高清头像url - 微博等级(免cookie版):用户微博等级 @@ -97,7 +97,7 @@ cookie修改完成后运行weiboSpider.py,该文件位于weibospider=>weibo_sp ```bash $ python weiboSpider.py ``` -程序会自动生成一个weibo文件夹,我们以后爬取的所有微博都被存储在这里。然后程序在该文件夹下生成一个名为"Dear-迪丽热巴"的文件夹,迪丽热巴的所有微博爬取结果都在这里。"Dear-迪丽热巴"文件夹里包含一个csv文件、一个txt文件、一个json文件、一个img文件夹和一个video文件夹,img文件夹用来存储下载到的图片,video文件夹用来存储下载到的视频。如果你设置了保存数据库功能,这些信息也会保存在数据库里,数据库设置见[设置数据库](#4设置数据库可选)部分。
+程序会自动生成一个weibo文件夹,我们以后爬取的所有微博都被存储在这里。然后程序在该文件夹下生成一个名为"Dear-迪丽热巴"的文件夹,迪丽热巴的所有微博爬取结果都在这里。"Dear-迪丽热巴"文件夹里包含一个csv文件、一个txt文件、一个json文件、一个img文件夹和一个video文件夹,img文件夹用来存储下载到的图片,video文件夹用来存储下载到的视频。如果你设置了保存数据库功能,这些信息也会保存在数据库里,数据库设置见[设置数据库](#3设置数据库可选)部分。

**csv结果文件如下所示:** ![](https://picture.cognize.me/cognize/github/weibospider/weibo_csv.png)*1669879400.csv*
@@ -161,7 +161,7 @@ json文件包含迪丽热巴的用户信息和上千条微博信息,内容较 **下载的视频如下所示:** ![](https://picture.cognize.me/cognize/github/weibospider/video.png)*video文件夹*
本次下载了70个视频,是她原创微博中的视频,视频名为yyyymmdd+微博id的形式。其中有一个视频因为网络原因下载失败,程序将它的微博id和视频url以“weibo_id:video_url”的形式写到了同文件夹下的not_downloaded.txt里。
-因为我本地没有安装MySQL数据库和MongoDB数据库,所以暂时设置成不写入数据库。如果你想要将爬取结果写入数据库,只需要先安装数据库(MySQL或MongoDB),再安装对应包(pymysql或pymongo),然后将mysql_write或mongodb_write值设置为1即可。写入MySQL需要用户名、密码等配置信息,这些配置如何设置见[设置数据库](#4设置数据库可选)部分。 +因为我本地没有安装MySQL数据库和MongoDB数据库,所以暂时设置成不写入数据库。如果你想要将爬取结果写入数据库,只需要先安装数据库(MySQL或MongoDB),再安装对应包(pymysql或pymongo),然后将mysql_write或mongodb_write值设置为1即可。写入MySQL需要用户名、密码等配置信息,这些配置如何设置见[设置数据库](#3设置数据库可选)部分。 ## 运行环境 - 开发语言:python2/python3 - 系统: Windows/Linux/macOS @@ -218,7 +218,7 @@ user_id_list是我们要爬取的微博的id,可以是一个,也可以是多 "user_id_list": ["1223178222", "1669879400", "1729370543"], ``` 上述代码代表我们要连续爬取user_id分别为“1223178222”、 “1669879400”、 “1729370543”的三个用户的微博,具体如何获取user_id见[如何获取user_id](#如何获取user_id)。
-user_id_list的值也可以是文件路径,我们可以把要爬的所有微博用户的user_id都写到txt文件里,然后把文件的位置路径赋值给user_id_list。
+user_id_list的值也可以是文件路径,我们可以把要爬的所有微博用户的user_id都写到txt文件里,然后把文件的位置路径赋值给user_id_list,**推荐这种方式**。
在txt文件中,每个user_id占一行,也可以在user_id后面加注释(可选),如用户昵称等信息,user_id和注释之间必需要有空格,文件名任意,类型为txt,位置位于本程序的同目录下,文件内容示例如下: ``` 1223178222 胡歌 @@ -245,13 +245,13 @@ since_date值可以是日期,也可以是整数。如果是日期,代表爬 "since_date": 10, ``` 代表爬取最近10天的微博,这个说法不是特别准确,准确说是爬取发布时间从**10天前到本程序开始执行时**之间的微博。
-**since_date是所有user的爬取起始时间,非常不灵活。如果你要爬多个用户,并且想单独为每个用户设置一个since_date,可以使用[定期自动爬取微博](#7定期自动爬取微博可选)方法二中的方法,该方法可以为多个用户设置不同的since_date,非常灵活。**
+**since_date是所有user的爬取起始时间,非常不灵活。如果你要爬多个用户,并且想单独为每个用户设置一个since_date,可以使用[定期自动爬取微博](#定期自动爬取微博可选)方法二中的方法,该方法可以为多个用户设置不同的since_date,非常灵活。**
**设置write_mode**
write_mode控制结果文件格式,取值范围是csv、txt、json、mongo和mysql,分别代表将结果文件写入csv、txt、json、MongoDB和MySQL数据库。write_mode可以同时包含这些取值中的一个或几个,如: ``` "write_mode": ["csv", "txt"], ``` -代表将结果信息写入csv文件和txt文件。特别注意,如果你想写入数据库,除了在write_mode添加对应数据库的名字外,还应该安装相关数据库和对应python模块,具体操作见[设置数据库](#4设置数据库可选)部分。
+代表将结果信息写入csv文件和txt文件。特别注意,如果你想写入数据库,除了在write_mode添加对应数据库的名字外,还应该安装相关数据库和对应python模块,具体操作见[设置数据库](#3设置数据库可选)部分。
**设置pic_download**
pic_download控制是否下载微博中的图片,值为1代表下载,值为0代表不下载,如 ``` @@ -388,16 +388,15 @@ $ python -m weibo_spider --config_path="config.json" 第二次爬取,我们想要接着上次的爬,那since_date的值应该是上次程序执行的日期,即2019-06-06 ``` 上面的方法太麻烦,因为每次都要手动设置since_date。因此我们需要动态设置since_date,即程序根据实际情况,自动生成since_date。
-有两种方法实现动态更新since_date,**推荐使用方法二**:
-**方法一:将since_date设置成整数** +有两种方法实现动态更新since_date,**推荐使用方法二**。
+**方法一:将since_date设置成整数**
将config.json文件中的since_date设置成整数,如: ``` "since_date": 10, ``` 这个配置告诉程序爬取最近10天的微博,更准确说是爬取发布时间从**10天前到本程序开始执行时**之间的微博。这样since_date就是一个动态的变量,每次程序执行时,它的值就是当前日期减10。配合crontab每9天或10天执行一次,就实现了定期追加爬取。
**方法二:将上次执行程序的时间写入文件(推荐)**
-这个方法很简单,就是用户把要爬的用户id写入txt文件,然后再把文件路径赋值给config.json中的user_id_list参数。
-txt文件名格式可以参考[程序设置](#3程序设置)中的**设置user_id_list**,这样设置就全部结束了。
+这个方法很简单,就是使用[程序设置](#2程序设置)中**设置user_id_list**的第二种方法设置user_id_list,这样设置就全部结束了。
说下这个方法的好处和原理,假如你的txt文件内容为: ``` 1669879400 From 2f054e432b8d6c87c183be000ee2bdf954c8d78f Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Sun, 31 May 2020 18:33:39 +0800 Subject: [PATCH 160/363] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index bbd7676e..33142e5c 100644 --- a/README.md +++ b/README.md @@ -90,8 +90,8 @@ } ``` -对于上述参数的含义以及取值范围,这里仅作简单介绍,详细信息见[程序设置](#3程序设置)。 ->**user_id_list**代表我们要爬取的微博用户的user_id,可以是一个或多个,也可以是文件路径,微博用户Dear-迪丽热巴的user_id为1669879400,具体如何获取user_id见[如何获取user_id](#如何获取user_id);
**filter**的值为1代表爬取全部原创微博,值为0代表爬取全部微博(原创+转发);
**since_date**代表我们要爬取since_date日期之后发布的微博,因为我要爬迪丽热巴的全部原创微博,所以since_date设置了一个非常早的值;
**write_mode**代表结果文件的保存类型,我想要把结果写入txt文件、csv文件和json文件,所以它的值为["csv", "txt", "json"],如果你想写入数据库,具体设置见[设置数据库](#4设置数据库可选);
**pic_download**值为1代表下载微博中的图片,值为0代表不下载;
**video_download**值为1代表下载微博中的视频,值为0代表不下载;
**cookie**是爬虫微博的cookie,具体如何获取cookie见[如何获取cookie](#如何获取cookie),获取cookie后把"your cookie"替换成真实的cookie值即可。
+对于上述参数的含义以及取值范围,这里仅作简单介绍,详细信息见[程序设置](#2程序设置)。 +>**user_id_list**代表我们要爬取的微博用户的user_id,可以是一个或多个,也可以是文件路径,微博用户Dear-迪丽热巴的user_id为1669879400,具体如何获取user_id见[如何获取user_id](#如何获取user_id);
**filter**的值为1代表爬取全部原创微博,值为0代表爬取全部微博(原创+转发);
**since_date**代表我们要爬取since_date日期之后发布的微博,因为我要爬迪丽热巴的全部原创微博,所以since_date设置了一个非常早的值;
**write_mode**代表结果文件的保存类型,我想要把结果写入txt文件、csv文件和json文件,所以它的值为["csv", "txt", "json"],如果你想写入数据库,具体设置见[设置数据库](#3设置数据库可选);
**pic_download**值为1代表下载微博中的图片,值为0代表不下载;
**video_download**值为1代表下载微博中的视频,值为0代表不下载;
**cookie**是爬虫微博的cookie,具体如何获取cookie见[如何获取cookie](#如何获取cookie),获取cookie后把"your cookie"替换成真实的cookie值即可。
cookie修改完成后运行weiboSpider.py,该文件位于weibospider=>weibo_spider: ```bash From 48a10b024210cb35005d13eb1ae1a174d167fb31 Mon Sep 17 00:00:00 2001 From: dataabc Date: Mon, 1 Jun 2020 20:24:18 +0800 Subject: [PATCH 161/363] =?UTF-8?q?perf:=20=E4=BC=98=E5=8C=96config.json?= =?UTF-8?q?=E6=96=87=E4=BB=B6=E8=AF=BB=E5=8F=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- setup.py | 22 +++++++++---------- weibo_spider/weiboSpider.py | 43 +++++++++++++++++++++---------------- 2 files changed, 35 insertions(+), 30 deletions(-) diff --git a/setup.py b/setup.py index 25e34a9a..07809240 100644 --- a/setup.py +++ b/setup.py @@ -1,27 +1,27 @@ import setuptools -with open("README.md", "r") as fh: +with open('README.md', 'r', encoding='utf-8') as fh: long_description = fh.read() setuptools.setup( - name="weibo-spider", - version="0.0.4", - author="Chen Lei", - author_email="chillychen1991@gmail.com", - description="新浪微博爬虫,用python爬取新浪微博数据。", + name='weibo-spider', + version='0.0.5', + author='Chen Lei', + author_email='chillychen1991@gmail.com', + description='新浪微博爬虫,用python爬取新浪微博数据。', long_description=long_description, - long_description_content_type="text/markdown", - url="https://github.com/dataabc/weiboSpider", + long_description_content_type='text/markdown', + url='https://github.com/dataabc/weiboSpider', packages=setuptools.find_packages(), classifiers=[ - "Programming Language :: Python :: 3", - "Operating System :: OS Independent", + 'Programming Language :: Python :: 3', + 'Operating System :: OS Independent', ], install_requires=[ + 'absl-py', 'lxml', 'requests', 'tqdm', - 'absl-py', ], python_requires='>=3.6', ) diff --git a/weibo_spider/weiboSpider.py b/weibo_spider/weiboSpider.py index 8ab19fc5..50492251 100644 --- a/weibo_spider/weiboSpider.py +++ b/weibo_spider/weiboSpider.py @@ -8,6 +8,7 @@ import os import random import re +import shutil import sys import traceback from collections import OrderedDict @@ -15,10 +16,10 @@ from 
time import sleep import requests +from absl import app, flags from lxml import etree from requests.adapters import HTTPAdapter from tqdm import tqdm -from absl import app, flags FLAGS = flags.FLAGS @@ -160,9 +161,9 @@ def extract_user_info(self): self.user[i] = '' for i in basic_info: if i.split(':', 1)[0] in zh_list: - self.user[en_list[zh_list.index( - i.split(':', 1)[0])]] = i.split(':', 1)[1].replace( - '\u3000', '') + self.user[en_list[zh_list.index(i.split( + ':', 1)[0])]] = i.split(':', + 1)[1].replace('\u3000', '') if selector.xpath("//div[@class='tip'][2]/text()")[0] == u'学习经历': self.user['education'] = selector.xpath( "//div[@class='c'][4]/text()")[0][1:].replace( @@ -1154,24 +1155,28 @@ def start(self): traceback.print_exc() +def get_config(): + """获取config.json数据""" + src = os.path.split(os.path.realpath(__file__))[0] + os.sep + 'config.json' + config_path = os.getcwd() + os.sep + 'config.json' + if FLAGS.config_path: + config_path = FLAGS.config_path + elif not os.path.isfile(config_path): + shutil.copy(src, config_path) + sys.exit(u'请先配置当前目录(%s)下的config.json文件' % os.getcwd()) + try: + with open(config_path) as f: + config = json.loads(f.read()) + return config + except ValueError: + sys.exit(u'config.json 格式不正确,请参考 ' + u'https://github.com/dataabc/weiboSpider#3程序设置') + + def main(argv): del argv # useless - try: - if FLAGS.config_path is not None: - config_path = FLAGS.config_path - else: - config_path = os.path.split( - os.path.realpath(__file__))[0] + os.sep + 'config.json' - if not os.path.isfile(config_path): - sys.exit(u'当前路径:%s 不存在配置文件config.json' % - (os.path.split(os.path.realpath(__file__))[0] + os.sep)) - with open(config_path) as f: - try: - config = json.loads(f.read()) - except ValueError: - sys.exit(u'config.json 格式不正确,请参考 ' - u'https://github.com/dataabc/weiboSpider#3程序设置') + config = get_config() wb = Weibo(config) wb.start() # 爬取微博信息 except Exception as e: From 11ddfaf1bd6442ba66c1de1c6a4b65cae0c2cb62 Mon Sep 17 00:00:00 2001 From: 
Chen Lei Date: Mon, 1 Jun 2020 20:41:43 +0800 Subject: [PATCH 162/363] Update README.md --- README.md | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index 33142e5c..d7b7b8fc 100644 --- a/README.md +++ b/README.md @@ -188,7 +188,7 @@ $ pip install -r requirements.txt 运行上述命令,将本项目下载到当前目录,如果下载成功当前目录会出现一个名为"weibospider"的文件夹;
**pip安装** ```bash -$ python -m pip install weibo-spider +$ python3 -m pip install weibo-spider ``` ### 2.程序设置 如果你使用的是**源码下载安装**,请打开**config.json**文件,你会看到如下内容: @@ -210,7 +210,7 @@ $ python -m pip install weibo-spider } } ``` -如果你使用的是**pip安装**,你需要在任意目录下,创建上面的config.json文件。
+如果你使用的是**pip安装**,第一次执行[运行脚本](#4运行脚本)中的命令,程序会自动创建上面的config.json文件。
下面讲解每个参数的含义与设置方法。
**设置user_id_list**
user_id_list是我们要爬取的微博的id,可以是一个,也可以是多个,例如: @@ -321,21 +321,19 @@ MySQL和MongDB数据库的写入内容一样。程序首先会创建一个名为 ```bash $ python weiboSpider.py ``` -**pip安装**的用户可以在config.json文件所在目录运行 +**pip安装**的用户可以在任意有写权限的目录运行 ```bash -$ python -m weibo_spider --config_path="config.json" +$ python3 -m weibo_spider +``` +第一次执行,会自动在当前目录创建config.json配置文件,配置好后执行同样的命令就可以获取微博了。如果你已经有config.json文件了,也可以通过config_path参数配置config.json路径,运行程序,命令行如下: +```bash +$ python3 -m weibo_spider --config_path="config.json" ``` ### 5.按需求修改脚本(可选) 本部分为可选部分,如果你不需要自己修改代码或添加新功能,可以忽略此部分。
本程序所有代码都位于weiboSpider.py文件,程序主体是一个Weibo类,上述所有功能都是通过在main函数调用Weibo类实现的,默认的调用代码如下: ```python - config_path = os.path.split( - os.path.realpath(__file__))[0] + os.sep + 'config.json' - if not os.path.isfile(config_path): - sys.exit(u'当前路径:%s 不存在配置文件config.json' % - (os.path.split(os.path.realpath(__file__))[0] + os.sep)) - with open(config_path) as f: - config = json.loads(f.read()) + config = get_config() wb = Weibo(config) wb.start() # 爬取微博信息 ``` From 9f8d277c9e2bc765d0930360dface7fa30ee59c4 Mon Sep 17 00:00:00 2001 From: dataabc Date: Tue, 2 Jun 2020 20:28:10 +0800 Subject: [PATCH 163/363] =?UTF-8?q?feat:=20=E4=B8=BApypi=E7=89=88user=5Fid?= =?UTF-8?q?=5Flist=E5=8F=82=E6=95=B0=E6=B7=BB=E5=8A=A0=E7=9B=B8=E5=AF=B9?= =?UTF-8?q?=E8=B7=AF=E5=BE=84=E6=94=AF=E6=8C=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit config.json文件的user_id_list参数如果是文件路径,该路径既可以是文件的绝对地址,也可以是文件在**命令行当前目录**的相对地址。如,在/home/test目录执行程序,user_id_list.txt文件可以放在该目录下,它的相对路径即config.json中user_id_list参数的值是“user_id_list.txt” Issue #160 --- weibo_spider/weiboSpider.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/weibo_spider/weiboSpider.py b/weibo_spider/weiboSpider.py index 50492251..bd2a4005 100644 --- a/weibo_spider/weiboSpider.py +++ b/weibo_spider/weiboSpider.py @@ -47,8 +47,7 @@ def __init__(self, config): user_id_list = config['user_id_list'] if not isinstance(user_id_list, list): if not os.path.isabs(user_id_list): - user_id_list = os.path.split( - os.path.realpath(__file__))[0] + os.sep + user_id_list + user_id_list = os.getcwd() + os.sep + user_id_list self.user_config_file_path = user_id_list # 用户配置文件路径 user_config_list = self.get_user_config_list(user_id_list) else: @@ -96,8 +95,7 @@ def validate_config(self, config): sys.exit(u'user_id_list值应为list类型或txt文件路径') if not isinstance(user_id_list, list): if not os.path.isabs(user_id_list): - user_id_list = os.path.split( - os.path.realpath(__file__))[0] + os.sep + user_id_list + 
user_id_list = os.getcwd() + os.sep + user_id_list if not os.path.isfile(user_id_list): sys.exit(u'不存在%s文件' % user_id_list) From 9da8f6830861f9d78bfe0557eff0470ec9acf6df Mon Sep 17 00:00:00 2001 From: dataabc Date: Tue, 2 Jun 2020 20:47:50 +0800 Subject: [PATCH 164/363] =?UTF-8?q?fix:=20=E4=BF=AE=E5=A4=8D=E5=8F=AA?= =?UTF-8?q?=E8=83=BD=E5=B0=86=E7=BB=93=E6=9E=9C=E6=96=87=E4=BB=B6=E5=86=99?= =?UTF-8?q?=E5=85=A5=E7=A8=8B=E5=BA=8F=E6=89=80=E5=9C=A8=E7=9B=AE=E5=BD=95?= =?UTF-8?q?=E7=9A=84bug?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Issue #160 --- weibo_spider/weiboSpider.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/weibo_spider/weiboSpider.py b/weibo_spider/weiboSpider.py index bd2a4005..335bc441 100644 --- a/weibo_spider/weiboSpider.py +++ b/weibo_spider/weiboSpider.py @@ -739,9 +739,8 @@ def get_one_page(self, page): def get_filepath(self, type): """获取结果文件路径""" try: - file_dir = os.path.split( - os.path.realpath(__file__) - )[0] + os.sep + 'weibo' + os.sep + self.user['nickname'] + file_dir = os.getcwd( + ) + os.sep + 'weibo' + os.sep + self.user['nickname'] if type == 'img' or type == 'video': file_dir = file_dir + os.sep + type if not os.path.isdir(file_dir): From 4c276c3b18703aaad4facf6f7f74ed9543416342 Mon Sep 17 00:00:00 2001 From: dataabc Date: Tue, 2 Jun 2020 20:53:34 +0800 Subject: [PATCH 165/363] =?UTF-8?q?fix:=20=E4=BF=AE=E5=A4=8Dlog.txt?= =?UTF-8?q?=E5=8F=AA=E8=83=BD=E5=86=99=E5=85=A5=E7=A8=8B=E5=BA=8F=E7=9B=AE?= =?UTF-8?q?=E5=BD=95=E7=9A=84bug?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- weibo_spider/weiboSpider.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/weibo_spider/weiboSpider.py b/weibo_spider/weiboSpider.py index 335bc441..05b2ae8a 100644 --- a/weibo_spider/weiboSpider.py +++ b/weibo_spider/weiboSpider.py @@ -756,8 +756,7 @@ def get_filepath(self, type): def write_log(self): 
"""当程序因cookie过期停止运行时,将相关信息写入log.txt""" - file_dir = os.path.split( - os.path.realpath(__file__))[0] + os.sep + 'weibo' + os.sep + file_dir = os.getcwd() + os.sep + 'weibo' + os.sep if not os.path.isdir(file_dir): os.makedirs(file_dir) file_path = file_dir + 'log.txt' From 93b48ee0bc3d5739958d4e95156ed3cedad5b809 Mon Sep 17 00:00:00 2001 From: Zhengyang Song Date: Tue, 2 Jun 2020 09:38:40 +0800 Subject: [PATCH 166/363] Code refactor. related #160 1. csv, txt, json, mongo, mysql writer verified. 2. img, video downloader verified. --- .gitignore | 11 +- downloader.py | 68 ---- html_parser.py | 325 ---------------- printer.py | 21 - requirements.txt | 1 + setup.py | 27 ++ spider.py | 166 -------- user_id_list.txt | 1 - validator.py | 44 --- weibo_spider/__init__,py | 0 weibo_spider/__main__.py | 5 + .../config.json | 10 +- weibo_spider/config_util.py | 98 +++++ weibo_spider/datetime_util.py | 10 + weibo_spider/downloader/__init__.py | 4 + weibo_spider/downloader/downloader.py | 61 +++ weibo_spider/downloader/img_downloader.py | 36 ++ weibo_spider/downloader/video_downloader.py | 20 + weibo_spider/parser/__init__.py | 4 + weibo_spider/parser/comment_parser.py | 43 +++ weibo_spider/parser/index_parser.py | 59 +++ weibo_spider/parser/info_parser.py | 57 +++ weibo_spider/parser/page_parser.py | 361 ++++++++++++++++++ weibo_spider/parser/parser.py | 5 + weibo_spider/parser/util.py | 31 ++ weibo_spider/printer.py | 24 ++ weibo_spider/spider.py | 260 +++++++++++++ weibo_spider/user_id_list.txt | 1 + weibo_spider/writer/__init__.py | 7 + weibo_spider/writer/csv_writer.py | 68 ++++ weibo_spider/writer/json_writer.py | 50 +++ weibo_spider/writer/mongo_writer.py | 46 +++ weibo_spider/writer/mysql_writer.py | 126 ++++++ weibo_spider/writer/txt_writer.py | 67 ++++ weibo_spider/writer/writer.py | 9 + writer.py | 317 --------------- 36 files changed, 1494 insertions(+), 949 deletions(-) delete mode 100644 downloader.py delete mode 100644 html_parser.py delete mode 100644 printer.py 
create mode 100644 setup.py delete mode 100644 spider.py delete mode 100644 user_id_list.txt delete mode 100644 validator.py create mode 100644 weibo_spider/__init__,py create mode 100644 weibo_spider/__main__.py rename config_sample.json => weibo_spider/config.json (62%) create mode 100644 weibo_spider/config_util.py create mode 100644 weibo_spider/datetime_util.py create mode 100644 weibo_spider/downloader/__init__.py create mode 100644 weibo_spider/downloader/downloader.py create mode 100644 weibo_spider/downloader/img_downloader.py create mode 100644 weibo_spider/downloader/video_downloader.py create mode 100644 weibo_spider/parser/__init__.py create mode 100644 weibo_spider/parser/comment_parser.py create mode 100644 weibo_spider/parser/index_parser.py create mode 100644 weibo_spider/parser/info_parser.py create mode 100644 weibo_spider/parser/page_parser.py create mode 100644 weibo_spider/parser/parser.py create mode 100644 weibo_spider/parser/util.py create mode 100644 weibo_spider/printer.py create mode 100644 weibo_spider/spider.py create mode 100644 weibo_spider/user_id_list.txt create mode 100644 weibo_spider/writer/__init__.py create mode 100644 weibo_spider/writer/csv_writer.py create mode 100644 weibo_spider/writer/json_writer.py create mode 100644 weibo_spider/writer/mongo_writer.py create mode 100644 weibo_spider/writer/mysql_writer.py create mode 100644 weibo_spider/writer/txt_writer.py create mode 100644 weibo_spider/writer/writer.py delete mode 100644 writer.py diff --git a/.gitignore b/.gitignore index 60c44892..66ce8c5e 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,10 @@ -config.json +.vscode *.pyc -weibo/ \ No newline at end of file +__pycache__ + +build/ +dist/ +*.egg-info + +weibo/ +config_sample.json \ No newline at end of file diff --git a/downloader.py b/downloader.py deleted file mode 100644 index b07bab60..00000000 --- a/downloader.py +++ /dev/null @@ -1,68 +0,0 @@ -# -*- coding: UTF-8 -*- -import os -import sys -import traceback 
- -import requests -from requests.adapters import HTTPAdapter -from tqdm import tqdm - - -class Downloader: - def __init__(self, config): - self.config = config - - def download_files(self, file_path, type, weibo): - """下载文件(图片/视频)""" - try: - if type == 'img': - describe = u'图片' - key = 'original_pictures' - else: - describe = u'视频' - key = 'video_url' - print(u'即将进行%s下载' % describe) - for w in tqdm(weibo, desc='Download progress'): - if w[key] != u'无': - file_prefix = w['publish_time'][:11].replace( - '-', '') + '_' + w['id'] - if type == 'img' and ',' in w[key]: - w[key] = w[key].split(',') - for j, url in enumerate(w[key]): - file_suffix = url[url.rfind('.'):] - file_name = file_prefix + '_' + str( - j + 1) + file_suffix - self.download_one_file( - url, file_path + os.sep + file_name, type, - w['id']) - else: - if type == 'video': - file_suffix = '.mp4' - else: - file_suffix = w[key][w[key].rfind('.'):] - file_name = file_prefix + file_suffix - self.download_one_file(w[key], - file_path + os.sep + file_name, - type, w['id']) - print(u'%s下载完毕,保存路径:' % describe) - print(file_path) - except Exception as e: - print('Error: ', e) - traceback.print_exc() - - def download_one_file(self, url, file_path, type, weibo_id): - """下载单个文件(图片/视频)""" - try: - if not os.path.isfile(file_path): - s = requests.Session() - s.mount(url, HTTPAdapter(max_retries=5)) - downloaded = s.get(url, timeout=(5, 10)) - with open(file_path, 'wb') as f: - f.write(downloaded.content) - except Exception as e: - error_file = './not_downloaded.txt' - with open(error_file, 'ab') as f: - url = weibo_id + ':' + url + '\n' - f.write(url.encode(sys.stdout.encoding)) - print('Error: ', e) - traceback.print_exc() diff --git a/html_parser.py b/html_parser.py deleted file mode 100644 index 40049f5b..00000000 --- a/html_parser.py +++ /dev/null @@ -1,325 +0,0 @@ -# -*- coding: UTF-8 -*- -import re -import sys -import traceback -from collections import OrderedDict -from datetime import datetime, timedelta - 
-import requests -from lxml import etree - - -class Parser: - def __init__(self, config): - self.config = config - - def deal_html(self, url, cookie): - """处理html""" - print("url:", url) - html = requests.get(url, cookies=cookie).content - selector = etree.HTML(html) - return selector - - def deal_garbled(self, info): - """处理乱码""" - info = (info.xpath('string(.)').replace(u'\u200b', '').encode( - sys.stdout.encoding, 'ignore').decode(sys.stdout.encoding)) - return info - - def extract_picture_urls(self, info, weibo_id): - """提取微博原始图片url""" - try: - a_list = info.xpath('div/a/@href') - first_pic = 'https://weibo.cn/mblog/pic/' + weibo_id + '?rl=0' - all_pic = 'https://weibo.cn/mblog/picAll/' + weibo_id + '?rl=1' - if first_pic in a_list: - if all_pic in a_list: - selector = self.deal_html(all_pic, self.config['cookie']) - preview_picture_list = selector.xpath('//img/@src') - picture_list = [ - p.replace('/thumb180/', '/large/') - for p in preview_picture_list - ] - picture_urls = ','.join(picture_list) - else: - if info.xpath('.//img/@src'): - preview_picture = info.xpath('.//img/@src')[-1] - picture_urls = preview_picture.replace( - '/wap180/', '/large/') - else: - sys.exit( - u"爬虫微博可能被设置成了'不显示图片',请前往" - u"'https://weibo.cn/account/customize/pic',修改为'显示'" - ) - else: - picture_urls = u'无' - return picture_urls - except Exception: - return u'无' - - def get_picture_urls(self, info, is_original): - """获取微博原始图片url""" - try: - weibo_id = info.xpath('@id')[0][2:] - picture_urls = {} - if is_original: - original_pictures = self.extract_picture_urls(info, weibo_id) - picture_urls['original_pictures'] = original_pictures - if not self.config['filter']: - picture_urls['retweet_pictures'] = u'无' - else: - retweet_url = info.xpath("div/a[@class='cc']/@href")[0] - retweet_id = retweet_url.split('/')[-1].split('?')[0] - retweet_pictures = self.extract_picture_urls(info, retweet_id) - picture_urls['retweet_pictures'] = retweet_pictures - a_list = info.xpath('div[last()]/a/@href') 
- original_picture = u'无' - for a in a_list: - if a.endswith(('.gif', '.jpeg', '.jpg', '.png')): - original_picture = a - break - picture_urls['original_pictures'] = original_picture - return picture_urls - except Exception as e: - print('Error: ', e) - traceback.print_exc() - - def get_video_url(self, info, is_original): - """获取微博视频url""" - try: - if is_original: - div_first = info.xpath('div')[0] - a_list = div_first.xpath('.//a') - video_link = u'无' - for a in a_list: - if 'm.weibo.cn/s/video/show?object_id=' in a.xpath( - '@href')[0]: - video_link = a.xpath('@href')[0] - break - if video_link != u'无': - video_link = video_link.replace( - 'm.weibo.cn/s/video/show', 'm.weibo.cn/s/video/object') - wb_info = requests.get( - video_link, cookies=self.config['cookie']).json() - video_url = wb_info['data']['object']['stream'].get( - 'hd_url') - if not video_url: - video_url = wb_info['data']['object']['stream']['url'] - if not video_url: # 说明该视频为直播 - video_url = u'无' - else: - video_url = u'无' - return video_url - except Exception: - return u'无' - - def get_page_num(self, selector): - """获取微博总页数""" - - if selector.xpath("//input[@name='mp']") == []: - page_num = 1 - else: - page_num = (int)( - selector.xpath("//input[@name='mp']")[0].attrib['value']) - return page_num - - def get_long_weibo(self, weibo_link): - """获取长原创微博""" - - selector = self.deal_html(weibo_link, self.config['cookie']) - info = selector.xpath("//div[@class='c']")[1] - wb_content = self.deal_garbled(info) - wb_time = info.xpath("//span[@class='ct']/text()")[0] - weibo_content = wb_content[wb_content.find(':') + - 1:wb_content.rfind(wb_time)] - return weibo_content - - def get_original_weibo(self, info, weibo_id): - """获取原创微博""" - - weibo_content = self.deal_garbled(info) - weibo_content = weibo_content[:weibo_content.rfind(u'赞')] - a_text = info.xpath('div//a/text()') - if u'全文' in a_text: - weibo_link = 'https://weibo.cn/comment/' + weibo_id - wb_content = self.get_long_weibo(weibo_link) - if 
wb_content: - weibo_content = wb_content - return weibo_content - - def get_long_retweet(self, weibo_link): - """获取长转发微博""" - wb_content = self.get_long_weibo(weibo_link) - weibo_content = wb_content[:wb_content.rfind(u'原文转发')] - return weibo_content - - def get_retweet(self, info, weibo_id): - """获取转发微博""" - wb_content = self.deal_garbled(info) - wb_content = wb_content[wb_content.find(':') + - 1:wb_content.rfind(u'赞')] - wb_content = wb_content[:wb_content.rfind(u'赞')] - a_text = info.xpath('div//a/text()') - if u'全文' in a_text: - weibo_link = 'https://weibo.cn/comment/' + weibo_id - weibo_content = self.get_long_retweet(weibo_link) - if weibo_content: - wb_content = weibo_content - retweet_reason = self.deal_garbled(info.xpath('div')[-1]) - retweet_reason = retweet_reason[:retweet_reason.rindex(u'赞')] - original_user = info.xpath("div/span[@class='cmt']/a/text()") - if original_user: - original_user = original_user[0] - wb_content = (retweet_reason + '\n' + u'原始用户: ' + original_user + - '\n' + u'转发内容: ' + wb_content) - else: - wb_content = retweet_reason + '\n' + u'转发内容: ' + wb_content - return wb_content - - def is_original(self, info): - """判断微博是否为原创微博""" - is_original = info.xpath("div/span[@class='cmt']") - if len(is_original) > 3: - return False - else: - return True - - def get_weibo_content(self, info, is_original): - """获取微博内容""" - weibo_id = info.xpath('@id')[0][2:] - if is_original: - weibo_content = self.get_original_weibo(info, weibo_id) - else: - weibo_content = self.get_retweet(info, weibo_id) - return weibo_content - - def get_publish_place(self, info): - """获取微博发布位置""" - div_first = info.xpath('div')[0] - a_list = div_first.xpath('a') - publish_place = u'无' - for a in a_list: - if ('place.weibo.com' in a.xpath('@href')[0] - and a.xpath('text()')[0] == u'显示地图'): - weibo_a = div_first.xpath("span[@class='ctt']/a") - if len(weibo_a) >= 1: - publish_place = weibo_a[-1] - if (u'视频' == div_first.xpath("span[@class='ctt']/a/text()") - [-1][-2:]): - if 
len(weibo_a) >= 2: - publish_place = weibo_a[-2] - else: - publish_place = u'无' - publish_place = self.deal_garbled(publish_place) - break - return publish_place - - def get_publish_time(self, info): - """获取微博发布时间""" - try: - str_time = info.xpath("div/span[@class='ct']") - str_time = self.deal_garbled(str_time[0]) - publish_time = str_time.split(u'来自')[0] - if u'刚刚' in publish_time: - publish_time = datetime.now().strftime('%Y-%m-%d %H:%M') - elif u'分钟' in publish_time: - minute = publish_time[:publish_time.find(u'分钟')] - minute = timedelta(minutes=int(minute)) - publish_time = (datetime.now() - - minute).strftime('%Y-%m-%d %H:%M') - elif u'今天' in publish_time: - today = datetime.now().strftime('%Y-%m-%d') - time = publish_time[3:] - publish_time = today + ' ' + time - if len(publish_time) > 16: - publish_time = publish_time[:16] - elif u'月' in publish_time: - year = datetime.now().strftime('%Y') - month = publish_time[0:2] - day = publish_time[3:5] - time = publish_time[7:12] - publish_time = year + '-' + month + '-' + day + ' ' + time - else: - publish_time = publish_time[:16] - return publish_time - except Exception as e: - print('Error: ', e) - traceback.print_exc() - - def get_publish_tool(self, info): - """获取微博发布工具""" - try: - str_time = info.xpath("div/span[@class='ct']") - str_time = self.deal_garbled(str_time[0]) - if len(str_time.split(u'来自')) > 1: - publish_tool = str_time.split(u'来自')[1] - else: - publish_tool = u'无' - return publish_tool - except Exception as e: - print('Error: ', e) - traceback.print_exc() - - def get_weibo_footer(self, info): - """获取微博点赞数、转发数、评论数""" - try: - footer = {} - pattern = r'\d+' - str_footer = info.xpath('div')[-1] - str_footer = self.deal_garbled(str_footer) - str_footer = str_footer[str_footer.rfind(u'赞'):] - weibo_footer = re.findall(pattern, str_footer, re.M) - - up_num = int(weibo_footer[0]) - footer['up_num'] = up_num - - retweet_num = int(weibo_footer[1]) - footer['retweet_num'] = retweet_num - - comment_num = 
int(weibo_footer[2]) - footer['comment_num'] = comment_num - return footer - except Exception as e: - print('Error: ', e) - traceback.print_exc() - - def get_one_weibo(self, info): - """获取一条微博的全部信息""" - try: - weibo = OrderedDict() - is_original = self.is_original(info) - if (not self.config['filter']) or is_original: - weibo['id'] = info.xpath('@id')[0][2:] - weibo['content'] = self.get_weibo_content(info, - is_original) # 微博内容 - weibo['publish_place'] = self.get_publish_place(info) # 微博发布位置 - weibo['publish_time'] = self.get_publish_time(info) # 微博发布时间 - weibo['publish_tool'] = self.get_publish_tool(info) # 微博发布工具 - footer = self.get_weibo_footer(info) - weibo['up_num'] = footer['up_num'] # 微博点赞数 - weibo['retweet_num'] = footer['retweet_num'] # 转发数 - weibo['comment_num'] = footer['comment_num'] # 评论数 - - picture_urls = self.get_picture_urls(info, is_original) - weibo['original_pictures'] = picture_urls[ - 'original_pictures'] # 原创图片url - if not self.config['filter']: - weibo['retweet_pictures'] = picture_urls[ - 'retweet_pictures'] # 转发图片url - weibo['original'] = is_original # 是否原创微博 - weibo['video_url'] = self.get_video_url(info, - is_original) # 微博视频url - else: - weibo = None - return weibo - except Exception as e: - print('Error: ', e) - traceback.print_exc() - - def is_pinned_weibo(self, info): - """判断微博是否为置顶微博""" - kt = info.xpath(".//span[@class='kt']/text()") - if kt and kt[0] == u'置顶': - return True - else: - return False diff --git a/printer.py b/printer.py deleted file mode 100644 index 5f87bdbd..00000000 --- a/printer.py +++ /dev/null @@ -1,21 +0,0 @@ -# -*- coding: UTF-8 -*- - - -class Printer: - def print_one_weibo(self, weibo): - """打印一条微博""" - print(weibo['content']) - print(u'微博发布位置:%s' % weibo['publish_place']) - print(u'微博发布时间:%s' % weibo['publish_time']) - print(u'微博发布工具:%s' % weibo['publish_tool']) - print(u'点赞数:%d' % weibo['up_num']) - print(u'转发数:%d' % weibo['retweet_num']) - print(u'评论数:%d' % weibo['comment_num']) - - def 
print_user_info(self, user): - """打印微博用户信息""" - print(u'用户昵称: %s' % user['nickname']) - print(u'用户id: %s' % user['id']) - print(u'微博数: %d' % user['weibo_num']) - print(u'关注数: %d' % user['following']) - print(u'粉丝数: %d' % user['followers']) diff --git a/requirements.txt b/requirements.txt index d0747725..806d086b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ lxml==4.3.4 requests==2.22.0 tqdm==4.32.2 +absl-py==0.9.0 \ No newline at end of file diff --git a/setup.py b/setup.py new file mode 100644 index 00000000..25e34a9a --- /dev/null +++ b/setup.py @@ -0,0 +1,27 @@ +import setuptools + +with open("README.md", "r") as fh: + long_description = fh.read() + +setuptools.setup( + name="weibo-spider", + version="0.0.4", + author="Chen Lei", + author_email="chillychen1991@gmail.com", + description="新浪微博爬虫,用python爬取新浪微博数据。", + long_description=long_description, + long_description_content_type="text/markdown", + url="https://github.com/dataabc/weiboSpider", + packages=setuptools.find_packages(), + classifiers=[ + "Programming Language :: Python :: 3", + "Operating System :: OS Independent", + ], + install_requires=[ + 'lxml', + 'requests', + 'tqdm', + 'absl-py', + ], + python_requires='>=3.6', +) diff --git a/spider.py b/spider.py deleted file mode 100644 index 2e524c22..00000000 --- a/spider.py +++ /dev/null @@ -1,166 +0,0 @@ -#!/usr/bin/env python -# -*- coding: UTF-8 -*- - -import os -import random -import sys -from datetime import date, datetime, timedelta -from time import sleep - -from tqdm import tqdm - -from downloader import Downloader -from html_parser import Parser -from printer import Printer -from validator import Validator -from writer import Writer, get_filepath, write_log - - -class Spider(object): - def __init__(self, config): - """Weibo类初始化""" - self.config = config - # change cookie from string to dict - if type(self.config['cookie']) == type(u''): - self.config['cookie'] = { - t.strip().split("=")[0]: t.strip().split("=")[1] - for t 
in self.config['cookie'].split(";") - } - if type(self.config['user_id_list']) == type(u""): - user_id_list = self.config['user_id_list'] - if not os.path.isabs(user_id_list): - user_id_list = os.path.split( - os.path.realpath(__file__))[0] + os.sep + user_id_list - self.config['user_id_list'] = user_id_list - with open(self.config['user_id_list'], 'rb') as f: - lines = f.read().splitlines() - lines = [line.decode('utf-8') for line in lines] - self.config['user_id_list'] = [ - line.split(' ')[0] for line in lines if - len(line.split(' ')) > 0 and line.split(' ')[0].isdigit() - ] - if type(self.config['since_date']) == type(0): - self.config['since_date'] = str( - date.today() - timedelta(self.config['since_date'])) - - self.validator = Validator(self.config) - self.validator.validate() - self.printer = Printer() - self.writer = Writer(self.config) - self.downloader = Downloader(self.config) - self.parser = Parser(self.config) - - def get_nickname(self): - """获取用户昵称""" - url = 'https://weibo.cn/%s/info' % (self.user['id']) - selector = self.parser.deal_html(url, self.config['cookie']) - nickname = selector.xpath('//title/text()')[0] - nickname = nickname[:-3] - if nickname == u'登录 - 新' or nickname == u'新浪': - write_log(self.config['since_date']) - sys.exit(u'cookie错误或已过期,请按照README中方法重新获取') - self.user['nickname'] = nickname - - def get_user_info(self, selector): - """获取用户昵称、微博数、关注数、粉丝数""" - self.get_nickname() # 获取用户昵称 - user_info = selector.xpath("//div[@class='tip2']/*/text()") - - self.user['weibo_num'] = int(user_info[0][3:-1]) - self.user['following'] = int(user_info[1][3:-1]) - self.user['followers'] = int(user_info[2][3:-1]) - self.printer.print_user_info(self.user) - self.writer.write_user(self.user) - print('*' * 100) - - def get_one_page(self, page): - """获取第page页的全部微博""" - url = 'https://weibo.cn/u/%s?page=%d' % (self.user['id'], page) - selector = self.parser.deal_html(url, self.config['cookie']) - info = selector.xpath("//div[@class='c']") - is_exist = 
info[0].xpath("div/span[@class='ctt']") - if is_exist: - for i in range(0, len(info) - 2): - weibo = self.parser.get_one_weibo(info[i]) - if weibo: - if weibo['id'] in self.weibo_id_list: - continue - publish_time = datetime.strptime( - weibo['publish_time'][:10], "%Y-%m-%d") - since_date = datetime.strptime(self.config['since_date'], - "%Y-%m-%d") - if publish_time < since_date: - if self.parser.is_pinned_weibo(info[i]): - continue - else: - return True - self.printer.print_one_weibo(weibo) - - self.weibo.append(weibo) - self.weibo_id_list.append(weibo['id']) - self.got_num += 1 - print('-' * 100) - - self.writer.write_weibo([weibo]) - - def get_weibo_info(self): - """获取微博信息""" - url = 'https://weibo.cn/u/%s' % (self.user['id']) - selector = self.parser.deal_html(url, self.config['cookie']) - self.get_user_info(selector) # 获取用户昵称、微博数、关注数、粉丝数 - - page_num = self.parser.get_page_num(selector) # 获取微博总页数 - page1 = 0 - random_pages = random.randint(1, 5) - for page in tqdm(range(1, page_num + 1), desc='Progress'): - is_end = self.get_one_page(page) # 获取第page页的全部微博 - if is_end: - break - - # 通过加入随机等待避免被限制。爬虫速度过快容易被系统限制(一段时间后限 - # 制会自动解除),加入随机等待模拟人的操作,可降低被系统限制的风险。默 - # 认是每爬取1到5页随机等待6到10秒,如果仍然被限,可适当增加sleep时间 - if page - page1 == random_pages and page < page_num: - sleep(random.randint(6, 10)) - page1 = page - random_pages = random.randint(1, 5) - - if not self.config['filter']: - print(u'共爬取' + str(self.got_num) + u'条微博') - else: - print(u'共爬取' + str(self.got_num) + u'条原创微博') - - def initialize_info(self, user_id): - """初始化爬虫信息""" - self.got_num = 0 # 爬取到的微博数 - self.weibo = [] # 存储爬取到的所有微博信息 - self.user = {'id': user_id} # 存储爬取到的用户信息 - self.weibo_id_list = [] # 存储爬取到的所有微博id - - def start(self): - """运行爬虫""" - for user_id in self.config['user_id_list']: - self.initialize_info(user_id) - print('*' * 100) - self.get_weibo_info() - print(u'信息抓取完毕') - print('*' * 100) - if self.config['pic_download'] == 1: - file_path = get_filepath('img', self.user['nickname']) - 
self.downloader.download_files(file_path, 'img', self.weibo) - if self.config['video_download'] == 1: - file_path = get_filepath('video', self.user['nickname']) - self.downloader.download_files(file_path, 'video', self.weibo) - - -if __name__ == '__main__': - import json - config_path = os.path.split( - os.path.realpath(__file__))[0] + os.sep + 'config.json' - if not os.path.isfile(config_path): - sys.exit(u'当前路径:%s 不存在配置文件config.json' % - (os.path.split(os.path.realpath(__file__))[0] + os.sep)) - with open(config_path) as f: - config = json.loads(f.read()) - spider = Spider(config) - spider.start() # 爬取微博信息 diff --git a/user_id_list.txt b/user_id_list.txt deleted file mode 100644 index 3acc69f8..00000000 --- a/user_id_list.txt +++ /dev/null @@ -1 +0,0 @@ -7053204102 majiko \ No newline at end of file diff --git a/validator.py b/validator.py deleted file mode 100644 index 3d31b437..00000000 --- a/validator.py +++ /dev/null @@ -1,44 +0,0 @@ -# -*- coding: UTF-8 -*- - -from datetime import datetime -import sys - - -def is_date(since_date): - """判断日期格式是否正确""" - try: - datetime.strptime(since_date, "%Y-%m-%d") - return True - except: - return False - - -class Validator: - def __init__(self, config): - """ - self.user_id_list = '' # 1. 用户id list,如昵称为"Dear-迪丽热巴"的id为'1669879400';2. 存储用户id list 的文件名 - self.since_date = since_date # 1. 起始时间,即爬取发布日期从该值到现在的微博,形式为yyyy-mm-dd 2. 
起始时间距离今天的天数,形式为一个整数 - self.filter = filter # 取值范围为0、1,程序默认值为0,代表要爬取用户的全部微博,1代表只爬取用户的原创微博 - self.mongodb_write = mongodb_write # 值为0代表不将结果写入MongoDB数据库,1代表写入 - self.mysql_write = mysql_write # 值为0代表不将结果写入MySQL数据库,1代表写入 - self.pic_download = pic_download # 取值范围为0、1,程序默认值为0,代表不下载微博原始图片,1代表下载 - self.video_download = video_download # 取值范围为0、1,程序默认为0,代表不下载微博视频,1代表下载 - self.mysql_config = { - } # MySQL数据库连接配置,可以不填,当使用者的mysql用户名、密码等与本程序默认值不同时,需要通过mysql_config来自定义 - """ - self.config = config - - def validate(self): - bool_config = ["filter", "pic_download", "video_download"] - date_config = ["since_date"] - - for key in bool_config: - if self.config[key] not in [0, 1]: - sys.exit("%s值应为0或1,请重新输入" % key) - for key in date_config: - if not (type(self.config[key]) == type(0) - or is_date(self.config[key])): - sys.exit("%s值应为yyyy-mm-dd形式或整数,请重新输入" % key) - for mode in self.config['write_mode']: - if mode not in ['txt', 'csv', 'mysql', 'mongo']: - sys.exit("write_mode值应为txt,csv,mysql,mongo,请重新输入") diff --git a/weibo_spider/__init__,py b/weibo_spider/__init__,py new file mode 100644 index 00000000..e69de29b diff --git a/weibo_spider/__main__.py b/weibo_spider/__main__.py new file mode 100644 index 00000000..d7d8fc71 --- /dev/null +++ b/weibo_spider/__main__.py @@ -0,0 +1,5 @@ +from .spider import main + +from absl import app + +app.run(main) diff --git a/config_sample.json b/weibo_spider/config.json similarity index 62% rename from config_sample.json rename to weibo_spider/config.json index 0f9f20b1..79420dbb 100644 --- a/config_sample.json +++ b/weibo_spider/config.json @@ -1,10 +1,10 @@ { "filter": 1, - "since_date": 10, - "write_mode": ["csv", "txt"], - "pic_download": 1, - "video_download": 1, - "cookie": "your cookie", + "since_date": "30", + "write_mode": ["csv"], + "pic_download": 0, + "video_download": 0, + "cookie": "", "mysql_config": { "host": "localhost", "port": 3306, diff --git a/weibo_spider/config_util.py b/weibo_spider/config_util.py new file mode 100644 index 
00000000..7eb5b199 --- /dev/null +++ b/weibo_spider/config_util.py @@ -0,0 +1,98 @@ +import codecs +import os +import sys +from datetime import datetime + + +def _is_date(since_date): + """判断日期格式是否正确""" + try: + datetime.strptime(since_date, "%Y-%m-%d") + return True + except ValueError: + return False + + +def validate_config(config): + """验证配置是否正确""" + + # 验证filter、pic_download、video_download + argument_list = ["filter", "pic_download", "video_download"] + for argument in argument_list: + if config[argument] != 0 and config[argument] != 1: + sys.exit(u"%s值应为0或1,请重新输入" % config[argument]) + + # 验证since_date + since_date = str(config["since_date"]) + if (not _is_date(since_date)) and (not since_date.isdigit()): + sys.exit(u"since_date值应为yyyy-mm-dd形式或整数,请重新输入") + + # 验证write_mode + write_mode = ["txt", "csv", "json", "mongo", "mysql"] + if not isinstance(config["write_mode"], list): + sys.exit(u"write_mode值应为list类型") + for mode in config["write_mode"]: + if mode not in write_mode: + sys.exit(u"%s为无效模式,请从txt、csv、json、mongo和mysql中挑选一个或多个作为write_mode" % mode) + + # 验证user_id_list + user_id_list = config["user_id_list"] + if (not isinstance(user_id_list, list)) and (not user_id_list.endswith(".txt")): + sys.exit(u"user_id_list值应为list类型或txt文件路径") + if not isinstance(user_id_list, list): + if not os.path.isabs(user_id_list): + user_id_list = ( + os.path.split(os.path.realpath(__file__))[0] + os.sep + user_id_list + ) + if not os.path.isfile(user_id_list): + sys.exit(u"不存在%s文件" % user_id_list) + + +def get_user_config_list(file_name, default_since_date): + """获取文件中的微博id信息""" + with open(file_name, "rb") as f: + try: + lines = f.read().splitlines() + lines = [line.decode("utf-8-sig") for line in lines] + except UnicodeDecodeError: + sys.exit(u"%s文件应为utf-8编码,请先将文件编码转为utf-8再运行程序" % file_name) + user_config_list = [] + for line in lines: + info = line.split(" ") + if len(info) > 0: + user_config = {} + user_config["user_uri"] = info[0] + if len(info) > 2 and _is_date(info[2]): 
+ if len(info) > 3 and _is_date(info[2] + " " + info[3]): + user_config["since_date"] = info[2] + " " + info[3] + else: + user_config["since_date"] = info[2] + else: + user_config["since_date"] = default_since_date + if user_config not in user_config_list: + user_config_list.append(user_config) + return user_config_list + + +def update_user_config_file(user_config_file_path, user_uri, nickname, start_time): + """更新用户配置文件""" + with open(user_config_file_path, "rb") as f: + lines = f.read().splitlines() + lines = [line.decode("utf-8-sig") for line in lines] + for i, line in enumerate(lines): + info = line.split(" ") + if len(info) > 0: + if user_uri == info[0]: + if len(info) == 1: + info.append(nickname) + info.append(start_time) + if len(info) == 2: + info.append(start_time) + if len(info) > 3 and _is_date(info[2] + " " + info[3]): + del info[3] + if len(info) > 2: + info[2] = start_time + lines[i] = " ".join(info) + break + with codecs.open(user_config_file_path, "w", encoding="utf-8") as f: + f.write("\n".join(lines)) diff --git a/weibo_spider/datetime_util.py b/weibo_spider/datetime_util.py new file mode 100644 index 00000000..1228af00 --- /dev/null +++ b/weibo_spider/datetime_util.py @@ -0,0 +1,10 @@ +from datetime import datetime + + +def str_to_time(text): + """将字符串转换成时间类型""" + if ':' in text: + result = datetime.strptime(text, '%Y-%m-%d %H:%M') + else: + result = datetime.strptime(text, '%Y-%m-%d') + return result diff --git a/weibo_spider/downloader/__init__.py b/weibo_spider/downloader/__init__.py new file mode 100644 index 00000000..9d8c4f0d --- /dev/null +++ b/weibo_spider/downloader/__init__.py @@ -0,0 +1,4 @@ +from .img_downloader import ImgDownloader +from .video_downloader import VideoDownloader + +__all__ = [ImgDownloader, VideoDownloader] diff --git a/weibo_spider/downloader/downloader.py b/weibo_spider/downloader/downloader.py new file mode 100644 index 00000000..56331567 --- /dev/null +++ b/weibo_spider/downloader/downloader.py @@ -0,0 +1,61 @@ 
+# -*- coding: UTF-8 -*- +import os +import sys +import traceback + +import requests +from requests.adapters import HTTPAdapter +from tqdm import tqdm + + +class Downloader: + def __init__(self, file_dir): + self.file_dir = file_dir + + self.file_type = "" + self.describe = u"" + self.key = "" + + def get_filepath(self): + """获取结果文件路径""" + try: + file_dir = self.file_dir + os.sep + self.file_type + if not os.path.isdir(file_dir): + os.makedirs(file_dir) + return file_dir + except Exception as e: + print("Error: ", e) + traceback.print_exc() + + def download_one_file(self, url, file_path, weibo_id): + """下载单个文件(图片/视频)""" + try: + if not os.path.isfile(file_path): + s = requests.Session() + s.mount(url, HTTPAdapter(max_retries=5)) + downloaded = s.get(url, timeout=(5, 10)) + with open(file_path, "wb") as f: + f.write(downloaded.content) + except Exception as e: + error_file = self.get_filepath() + os.sep + "not_downloaded.txt" + with open(error_file, "ab") as f: + url = weibo_id + ":" + url + "\n" + f.write(url.encode(sys.stdout.encoding)) + print("Error: ", e) + traceback.print_exc() + + def handle_download(self): + pass + + def download_files(self, weibos): + """下载文件(图片/视频)""" + try: + print(u"即将进行%s下载" % self.describe) + for w in tqdm(weibos, desc="Download progress"): + if w[self.key] != u"无": + self.handle_download(w[self.key], w) + print(u"%s下载完毕,保存路径:" % self.describe) + print(self.file_dir) + except Exception as e: + print("Error: ", e) + traceback.print_exc() diff --git a/weibo_spider/downloader/img_downloader.py b/weibo_spider/downloader/img_downloader.py new file mode 100644 index 00000000..88e9656d --- /dev/null +++ b/weibo_spider/downloader/img_downloader.py @@ -0,0 +1,36 @@ +import os + +from .downloader import Downloader + + +class ImgDownloader(Downloader): + def __init__(self, file_dir): + self.file_dir = file_dir + + self.file_type = "img" + self.describe = u"图片" + self.key = "original_pictures" + + def handle_download(self, urls, w): + 
"""处理下载相关操作""" + file_prefix = w["publish_time"][:11].replace("-", "") + "_" + w["id"] + if "," in urls: + url_list = urls.split(",") + for i, url in enumerate(url_list): + index = url.rfind(".") + if len(url) - index >= 5: + file_suffix = ".jpg" + else: + file_suffix = url[index:] + file_name = file_prefix + "_" + str(i + 1) + file_suffix + file_path = self.file_dir + os.sep + file_name + self.download_one_file(url, file_path, w["id"]) + else: + index = urls.rfind(".") + if len(urls) - index > 5: + file_suffix = ".jpg" + else: + file_suffix = urls[index:] + file_name = file_prefix + file_suffix + file_path = self.file_dir + os.sep + file_name + self.download_one_file(urls, file_path, w["id"]) diff --git a/weibo_spider/downloader/video_downloader.py b/weibo_spider/downloader/video_downloader.py new file mode 100644 index 00000000..caa21e91 --- /dev/null +++ b/weibo_spider/downloader/video_downloader.py @@ -0,0 +1,20 @@ +import os + +from .downloader import Downloader + + +class VideoDownloader(Downloader): + def __init__(self, file_dir): + self.file_dir = file_dir + + self.file_type = "img" + self.describe = u"视频" + self.key = "video_url" + + def handle_download(self, urls, w): + """处理下载相关操作""" + file_prefix = w["publish_time"][:11].replace("-", "") + "_" + w["id"] + file_suffix = ".mp4" + file_name = file_prefix + file_suffix + file_path = self.file_dir + os.sep + file_name + self.download_one_file(urls, file_path, w["id"]) diff --git a/weibo_spider/parser/__init__.py b/weibo_spider/parser/__init__.py new file mode 100644 index 00000000..46f81f0e --- /dev/null +++ b/weibo_spider/parser/__init__.py @@ -0,0 +1,4 @@ +from .index_parser import IndexParser +from .page_parser import PageParser + +__all__ = [IndexParser, PageParser] diff --git a/weibo_spider/parser/comment_parser.py b/weibo_spider/parser/comment_parser.py new file mode 100644 index 00000000..157f0bc0 --- /dev/null +++ b/weibo_spider/parser/comment_parser.py @@ -0,0 +1,43 @@ +from time import sleep 
+import random +import traceback + +from .parser import Parser +from .util import handle_html + + +class CommentParser(Parser): + def __init__(self, cookie, weibo_id): + self.cookie = cookie + self.url = "https://weibo.cn/comment/" + weibo_id + self.selector = handle_html(self.cookie, self.url) + + def get_long_weibo(self): + """获取长原创微博""" + try: + for i in range(5): + self.selector = self.handle_html(self.cookie, self.url) + if self.selector is not None: + info = self.selector.xpath("//div[@class='c']")[1] + wb_content = self.handle_garbled(info) + wb_time = info.xpath("//span[@class='ct']/text()")[0] + weibo_content = wb_content[ + wb_content.find(":") + 1 : wb_content.rfind(wb_time) + ] + if weibo_content is not None: + return weibo_content + sleep(random.randint(6, 10)) + except Exception as e: + return u"网络出错" + print("Error: ", e) + traceback.print_exc() + + def get_long_retweet(self): + """获取长转发微博""" + try: + wb_content = self.get_long_weibo() + weibo_content = wb_content[: wb_content.rfind(u"原文转发")] + return weibo_content + except Exception as e: + print("Error: ", e) + traceback.print_exc() diff --git a/weibo_spider/parser/index_parser.py b/weibo_spider/parser/index_parser.py new file mode 100644 index 00000000..6bcfee66 --- /dev/null +++ b/weibo_spider/parser/index_parser.py @@ -0,0 +1,59 @@ +import traceback + +from .util import handle_html +from .parser import Parser +from .info_parser import InfoParser + + +class IndexParser(Parser): + def __init__(self, cookie, user_uri): + self.cookie = cookie + self.user_uri = user_uri + self.url = "https://weibo.cn/%s" % (user_uri) + self.selector = handle_html(self.cookie, self.url) + + def _get_user_id(self): + """获取用户id,使用者输入的user_id不一定是正确的,可能是个性域名等,需要获取真正的user_id""" + user_id = self.user_uri + url_list = self.selector.xpath("//div[@class='u']//a") + for url in url_list: + if (url.xpath("string(.)")) == u"资料": + if url.xpath("@href") and url.xpath("@href")[0].endswith("/info"): + link = url.xpath("@href")[0] + 
user_id = link[1:-5] + break + return user_id + + def get_user(self): + """获取用户信息、微博数、关注数、粉丝数""" + try: + self.user = {} + self.user["id"] = self._get_user_id() + user = InfoParser(self.cookie, self.user["id"]).extract_user_info() # 获取用户信息 + for k, v in user.items(): + self.user[k] = v + user_info = self.selector.xpath("//div[@class='tip2']/*/text()") + weibo_num = int(user_info[0][3:-1]) + following = int(user_info[1][3:-1]) + followers = int(user_info[2][3:-1]) + self.user["weibo_num"] = weibo_num + self.user["following"] = following + self.user["followers"] = followers + return self.user + except Exception as e: + print("Error: ", e) + traceback.print_exc() + + def get_page_num(self): + """获取微博总页数""" + try: + if self.selector.xpath("//input[@name='mp']") == []: + page_num = 1 + else: + page_num = (int)( + self.selector.xpath("//input[@name='mp']")[0].attrib["value"] + ) + return page_num + except Exception as e: + print("Error: ", e) + traceback.print_exc() diff --git a/weibo_spider/parser/info_parser.py b/weibo_spider/parser/info_parser.py new file mode 100644 index 00000000..0b7f990f --- /dev/null +++ b/weibo_spider/parser/info_parser.py @@ -0,0 +1,57 @@ +import traceback +import sys + +from .util import handle_html +from .parser import Parser + + +class InfoParser(Parser): + def __init__(self, cookie, user_id): + self.cookie = cookie + self.url = "https://weibo.cn/%s/info" % (user_id) + self.selector = handle_html(self.cookie, self.url) + + def extract_user_info(self): + """提取用户信息""" + try: + user = {} + nickname = self.selector.xpath("//title/text()")[0] + nickname = nickname[:-3] + if nickname == u"登录 - 新" or nickname == u"新浪": + sys.exit(u"cookie错误或已过期,请按照README中方法重新获取") + user["nickname"] = nickname + basic_info = self.selector.xpath("//div[@class='c'][3]/text()") + zh_list = [u"性别", u"地区", u"生日", u"简介", u"认证", u"达人"] + en_list = [ + "gender", + "location", + "birthday", + "description", + "verified_reason", + "talent", + "education", + "work", + ] + for 
i in en_list: + user[i] = "" + for i in basic_info: + if i.split(":", 1)[0] in zh_list: + user[en_list[zh_list.index(i.split(":", 1)[0])]] = i.split(":", 1)[ + 1 + ].replace("\u3000", "") + if self.selector.xpath("//div[@class='tip'][2]/text()")[0] == u"学习经历": + user["education"] = self.selector.xpath("//div[@class='c'][4]/text()")[ + 0 + ][1:].replace(u"\xa0", u" ") + if self.selector.xpath("//div[@class='tip'][3]/text()")[0] == u"工作经历": + user["work"] = self.selector.xpath("//div[@class='c'][5]/text()")[ + 0 + ][1:].replace(u"\xa0", u" ") + elif self.selector.xpath("//div[@class='tip'][2]/text()")[0] == u"工作经历": + user["work"] = self.selector.xpath("//div[@class='c'][4]/text()")[0][ + 1: + ].replace(u"\xa0", u" ") + return user + except Exception as e: + print("Error: ", e) + traceback.print_exc() diff --git a/weibo_spider/parser/page_parser.py b/weibo_spider/parser/page_parser.py new file mode 100644 index 00000000..1cd87e55 --- /dev/null +++ b/weibo_spider/parser/page_parser.py @@ -0,0 +1,361 @@ +from datetime import datetime, timedelta +import traceback +from collections import OrderedDict +import re +import sys + +import requests + +from .parser import Parser +from .comment_parser import CommentParser +from .util import handle_html, handle_garbled +from .. 
import printer, datetime_util + + +class PageParser(Parser): + def __init__(self, cookie, user_uri, page, filter): + self.cookie = cookie + self.page = page + self.url = "https://weibo.cn/%s?page=%d" % (user_uri, page) + self.selector = handle_html(self.cookie, self.url) + self.filter = filter + + def get_one_page(self, since_date, weibo_id_list): + """获取第page页的全部微博""" + try: + info = self.selector.xpath("//div[@class='c']") + is_exist = info[0].xpath("div/span[@class='ctt']") + weibos = [] + if is_exist: + since_date = datetime_util.str_to_time(since_date) + for i in range(0, len(info) - 2): + weibo = self.get_one_weibo(info[i]) + if weibo: + if weibo["id"] in weibo_id_list: + continue + publish_time = datetime_util.str_to_time(weibo["publish_time"]) + + if publish_time < since_date: + if self.is_pinned_weibo(info[i]): + continue + else: + return weibos, weibo_id_list + printer.print_one_weibo(weibo) + weibos.append(weibo) + weibo_id_list.append(weibo["id"]) + return weibos, weibo_id_list + except Exception as e: + print("Error: ", e) + traceback.print_exc() + + def is_original(self, info): + """判断微博是否为原创微博""" + is_original = info.xpath("div/span[@class='cmt']") + if len(is_original) > 3: + return False + else: + return True + + def get_original_weibo(self, info, weibo_id): + """获取原创微博""" + try: + weibo_content = handle_garbled(info) + weibo_content = weibo_content[: weibo_content.rfind(u"赞")] + a_text = info.xpath("div//a/text()") + if u"全文" in a_text: + wb_content = CommentParser(self.cookie, weibo_id).get_long_weibo() + if wb_content: + weibo_content = wb_content + return weibo_content + except Exception as e: + print("Error: ", e) + traceback.print_exc() + + def get_retweet(self, info, weibo_id): + """获取转发微博""" + try: + weibo_content = handle_garbled(info) + weibo_content = weibo_content[ + weibo_content.find(":") + 1 : weibo_content.rfind(u"赞") + ] + weibo_content = weibo_content[: weibo_content.rfind(u"赞")] + a_text = info.xpath("div//a/text()") + if u"全文" 
in a_text: + wb_content = CommentParser(self.cookie, weibo_id).get_long_retweet() + if wb_content: + weibo_content = wb_content + retweet_reason = handle_garbled(info.xpath("div")[-1]) + retweet_reason = retweet_reason[: retweet_reason.rindex(u"赞")] + original_user = info.xpath("div/span[@class='cmt']/a/text()") + if original_user: + original_user = original_user[0] + weibo_content = ( + retweet_reason + + "\n" + + u"原始用户: " + + original_user + + "\n" + + u"转发内容: " + + weibo_content + ) + else: + weibo_content = retweet_reason + "\n" + u"转发内容: " + weibo_content + return weibo_content + except Exception as e: + print("Error: ", e) + traceback.print_exc() + + def get_weibo_content(self, info, is_original): + """获取微博内容""" + try: + weibo_id = info.xpath("@id")[0][2:] + if is_original: + weibo_content = self.get_original_weibo(info, weibo_id) + else: + weibo_content = self.get_retweet(info, weibo_id) + return weibo_content + except Exception as e: + print("Error: ", e) + traceback.print_exc() + + def get_article_url(self, info): + """获取微博头条文章的url""" + article_url = "" + text = handle_garbled(info) + if text.startswith(u"发布了头条文章"): + url = info.xpath(".//a/@href") + if url and url[0].startswith("https://weibo.cn/sinaurl"): + article_url = url[0] + return article_url + + def get_publish_place(self, info): + """获取微博发布位置""" + try: + div_first = info.xpath("div")[0] + a_list = div_first.xpath("a") + publish_place = u"无" + for a in a_list: + if ( + "place.weibo.com" in a.xpath("@href")[0] + and a.xpath("text()")[0] == u"显示地图" + ): + weibo_a = div_first.xpath("span[@class='ctt']/a") + if len(weibo_a) >= 1: + publish_place = weibo_a[-1] + if ( + u"视频" + == div_first.xpath("span[@class='ctt']/a/text()")[-1][-2:] + ): + if len(weibo_a) >= 2: + publish_place = weibo_a[-2] + else: + publish_place = u"无" + publish_place = handle_garbled(publish_place) + break + return publish_place + except Exception as e: + print("Error: ", e) + traceback.print_exc() + + def get_publish_time(self, 
info): + """获取微博发布时间""" + try: + str_time = info.xpath("div/span[@class='ct']") + str_time = handle_garbled(str_time[0]) + publish_time = str_time.split(u"来自")[0] + if u"刚刚" in publish_time: + publish_time = datetime.now().strftime("%Y-%m-%d %H:%M") + elif u"分钟" in publish_time: + minute = publish_time[: publish_time.find(u"分钟")] + minute = timedelta(minutes=int(minute)) + publish_time = (datetime.now() - minute).strftime("%Y-%m-%d %H:%M") + elif u"今天" in publish_time: + today = datetime.now().strftime("%Y-%m-%d") + time = publish_time[3:] + publish_time = today + " " + time + if len(publish_time) > 16: + publish_time = publish_time[:16] + elif u"月" in publish_time: + year = datetime.now().strftime("%Y") + month = publish_time[0:2] + day = publish_time[3:5] + time = publish_time[7:12] + publish_time = year + "-" + month + "-" + day + " " + time + else: + publish_time = publish_time[:16] + return publish_time + except Exception as e: + print("Error: ", e) + traceback.print_exc() + + def get_publish_tool(self, info): + """获取微博发布工具""" + try: + str_time = info.xpath("div/span[@class='ct']") + str_time = handle_garbled(str_time[0]) + if len(str_time.split(u"来自")) > 1: + publish_tool = str_time.split(u"来自")[1] + else: + publish_tool = u"无" + return publish_tool + except Exception as e: + print("Error: ", e) + traceback.print_exc() + + def get_weibo_footer(self, info): + """获取微博点赞数、转发数、评论数""" + try: + footer = {} + pattern = r"\d+" + str_footer = info.xpath("div")[-1] + str_footer = handle_garbled(str_footer) + str_footer = str_footer[str_footer.rfind(u"赞") :] + weibo_footer = re.findall(pattern, str_footer, re.M) + + up_num = int(weibo_footer[0]) + footer["up_num"] = up_num + + retweet_num = int(weibo_footer[1]) + footer["retweet_num"] = retweet_num + + comment_num = int(weibo_footer[2]) + footer["comment_num"] = comment_num + return footer + except Exception as e: + print("Error: ", e) + traceback.print_exc() + + def get_picture_urls(self, info, is_original): + 
"""获取微博原始图片url""" + try: + weibo_id = info.xpath("@id")[0][2:] + picture_urls = {} + if is_original: + original_pictures = self.extract_picture_urls(info, weibo_id) + picture_urls["original_pictures"] = original_pictures + if not self.filter: + picture_urls["retweet_pictures"] = u"无" + else: + retweet_url = info.xpath("div/a[@class='cc']/@href")[0] + retweet_id = retweet_url.split("/")[-1].split("?")[0] + retweet_pictures = self.extract_picture_urls(info, retweet_id) + picture_urls["retweet_pictures"] = retweet_pictures + a_list = info.xpath("div[last()]/a/@href") + original_picture = u"无" + for a in a_list: + if a.endswith((".gif", ".jpeg", ".jpg", ".png")): + original_picture = a + break + picture_urls["original_pictures"] = original_picture + return picture_urls + except Exception as e: + print("Error: ", e) + traceback.print_exc() + + def get_video_url(self, info, is_original): + """获取微博视频url""" + try: + if is_original: + div_first = info.xpath("div")[0] + a_list = div_first.xpath(".//a") + video_link = u"无" + for a in a_list: + if "m.weibo.cn/s/video/show?object_id=" in a.xpath("@href")[0]: + video_link = a.xpath("@href")[0] + break + if video_link != u"无": + video_link = video_link.replace( + "m.weibo.cn/s/video/show", "m.weibo.cn/s/video/object" + ) + wb_info = requests.get(video_link, cookies=self.cookie).json() + video_url = wb_info["data"]["object"]["stream"].get("hd_url") + if not video_url: + video_url = wb_info["data"]["object"]["stream"]["url"] + if not video_url: # 说明该视频为直播 + video_url = u"无" + else: + video_url = u"无" + return video_url + except Exception as e: + return u"无" + print("Error: ", e) + traceback.print_exc() + + def is_pinned_weibo(self, info): + """判断微博是否为置顶微博""" + kt = info.xpath(".//span[@class='kt']/text()") + if kt and kt[0] == u"置顶": + return True + else: + return False + + def get_one_weibo(self, info): + """获取一条微博的全部信息""" + try: + weibo = OrderedDict() + is_original = self.is_original(info) + if (not self.filter) or is_original: 
+ weibo["id"] = info.xpath("@id")[0][2:] + weibo["content"] = self.get_weibo_content(info, is_original) # 微博内容 + weibo["article_url"] = self.get_article_url(info) # 头条文章url + picture_urls = self.get_picture_urls(info, is_original) + weibo["original_pictures"] = picture_urls[ + "original_pictures" + ] # 原创图片url + if not self.filter: + weibo["retweet_pictures"] = picture_urls[ + "retweet_pictures" + ] # 转发图片url + weibo["original"] = is_original # 是否原创微博 + weibo["video_url"] = self.get_video_url(info, is_original) # 微博视频url + weibo["publish_place"] = self.get_publish_place(info) # 微博发布位置 + weibo["publish_time"] = self.get_publish_time(info) # 微博发布时间 + weibo["publish_tool"] = self.get_publish_tool(info) # 微博发布工具 + footer = self.get_weibo_footer(info) + weibo["up_num"] = footer["up_num"] # 微博点赞数 + weibo["retweet_num"] = footer["retweet_num"] # 转发数 + weibo["comment_num"] = footer["comment_num"] # 评论数 + else: + weibo = None + print(u"正在过滤转发微博") + return weibo + except Exception as e: + print("Error: ", e) + traceback.print_exc() + + def extract_picture_urls(self, info, weibo_id): + """提取微博原始图片url""" + try: + a_list = info.xpath("div/a/@href") + first_pic = "https://weibo.cn/mblog/pic/" + weibo_id + "?rl=0" + all_pic = "https://weibo.cn/mblog/picAll/" + weibo_id + "?rl=1" + picture_urls = u"无" + if first_pic in a_list: + if all_pic in a_list: + selector = handle_html(self.cookie, all_pic) + preview_picture_list = selector.xpath("//img/@src") + picture_list = [ + p.replace("/thumb180/", "/large/") for p in preview_picture_list + ] + picture_urls = ",".join(picture_list) + else: + if info.xpath(".//img/@src"): + for link in info.xpath("div/a"): + if len(link.xpath("@href")) > 0: + if first_pic == link.xpath("@href")[0]: + if len(link.xpath("img/@src")) > 0: + preview_picture = link.xpath("img/@src")[0] + picture_urls = preview_picture.replace( + "/wap180/", "/large/" + ) + break + else: + sys.exit( + u"爬虫微博可能被设置成了'不显示图片',请前往" + 
u"'https://weibo.cn/account/customize/pic',修改为'显示'" + ) + return picture_urls + except Exception as e: + return u"无" + print("Error: ", e) + traceback.print_exc() diff --git a/weibo_spider/parser/parser.py b/weibo_spider/parser/parser.py new file mode 100644 index 00000000..cee1a03d --- /dev/null +++ b/weibo_spider/parser/parser.py @@ -0,0 +1,5 @@ +class Parser: + def __init__(self, cookie): + self.cookie = cookie + self.url = "" + self.selector = None diff --git a/weibo_spider/parser/util.py b/weibo_spider/parser/util.py new file mode 100644 index 00000000..aff70ad4 --- /dev/null +++ b/weibo_spider/parser/util.py @@ -0,0 +1,31 @@ +import sys +import traceback + +from lxml import etree +import requests + + +def handle_html(cookie, url): + """处理html""" + try: + html = requests.get(url, cookies=cookie).content + selector = etree.HTML(html) + return selector + except Exception as e: + print("Error: ", e) + traceback.print_exc() + + +def handle_garbled(info): + """处理乱码""" + try: + info = ( + info.xpath("string(.)") + .replace(u"\u200b", "") + .encode(sys.stdout.encoding, "ignore") + .decode(sys.stdout.encoding) + ) + return info + except Exception as e: + print("Error: ", e) + traceback.print_exc() diff --git a/weibo_spider/printer.py b/weibo_spider/printer.py new file mode 100644 index 00000000..7f1ae262 --- /dev/null +++ b/weibo_spider/printer.py @@ -0,0 +1,24 @@ +# -*- coding: UTF-8 -*- + + +def print_one_weibo(weibo): + """打印一条微博""" + print(weibo["content"]) + print(u"微博发布位置:%s" % weibo["publish_place"]) + print(u"发布发布时间:%s" % weibo["publish_time"]) + print(u"发布发布工具:%s" % weibo["publish_tool"]) + print(u"点赞数:%d" % weibo["up_num"]) + print(u"转发数:%d" % weibo["retweet_num"]) + print(u"评论数:%d" % weibo["comment_num"]) + print(u"url:https://weibo.cn/comment/%s" % weibo["id"]) + print("-" * 100) + + +def print_user_info(user): + """打印微博用户信息""" + print(u"用户昵称: %s" % user["nickname"]) + print(u"用户id: %s" % user["id"]) + print(u"微博数: %d" % user["weibo_num"]) + print(u"关注数: 
%d" % user["following"]) + print(u"粉丝数: %d" % user["followers"]) + diff --git a/weibo_spider/spider.py b/weibo_spider/spider.py new file mode 100644 index 00000000..65961149 --- /dev/null +++ b/weibo_spider/spider.py @@ -0,0 +1,260 @@ +#!/usr/bin/env python +# -*- coding: UTF-8 -*- + +import json +import os +import random +import sys +import traceback +from datetime import date, datetime, timedelta +from time import sleep + +from tqdm import tqdm +from absl import app, flags + +from . import datetime_util +from . import config_util +from . import printer +from .parser import IndexParser, PageParser + +FLAGS = flags.FLAGS + +flags.DEFINE_string("config_path", None, "The path to config.json.") +flags.DEFINE_string("user_id_list", None, "The path to user_id_list.txt.") +flags.DEFINE_string("output_dir", None, "The dir path to store results.") + + +class Spider: + def __init__(self, config): + """Weibo类初始化""" + self.filter = config["filter"] # 取值范围为0、1,程序默认值为0,代表要爬取用户的全部微博,1代表只爬取用户的原创微博 + since_date = str(config["since_date"]) + if since_date.isdigit(): + since_date = str(date.today() - timedelta(int(since_date))) + self.since_date = since_date # 起始时间,即爬取发布日期从该值到现在的微博,形式为yyyy-mm-dd + self.write_mode = config[ + "write_mode" + ] # 结果信息保存类型,为list形式,可包含txt、csv、json、mongo和mysql五种类型 + self.pic_download = config["pic_download"] # 取值范围为0、1,程序默认值为0,代表不下载微博原始图片,1代表下载 + self.video_download = config[ + "video_download" + ] # 取值范围为0、1,程序默认为0,代表不下载微博视频,1代表下载 + self.cookie = {"Cookie": config["cookie"]} + self.mysql_config = config.get("mysql_config") # MySQL数据库连接配置,可以不填 + user_id_list = config["user_id_list"] + if not isinstance(user_id_list, list): + if FLAGS.user_id_list is not None: + user_id_list = FLAGS.user_id_list + else: + user_id_list = ( + os.path.split(os.path.realpath(__file__))[0] + os.sep + user_id_list + ) + if not os.path.isfile(user_id_list): + sys.exit(u"当前路径:%s 不存在配置文件config.json" % user_id_list) + self.user_config_file_path = user_id_list # 用户配置文件路径 + 
user_config_list = config_util.get_user_config_list( + user_id_list, self.since_date + ) + else: + self.user_config_file_path = "" + user_config_list = [ + {"user_uri": user_id, "since_date": self.since_date} + for user_id in user_id_list + ] + self.user_config_list = user_config_list # 要爬取的微博用户的user_config列表 + self.user_config = {} # 用户配置,包含用户id和since_date + self.start_time = "" # 获取用户第一条微博时的时间 + self.user = {} # 存储爬取到的用户信息 + self.got_num = 0 # 存储爬取到的微博数 + self.weibo = [] # 存储爬取到的所有微博信息 + self.weibo_id_list = [] # 存储爬取到的所有微博id + + def write_weibo(self, weibos): + """将爬取到的信息写入文件或数据库""" + for writer in self.writers: + writer.write_weibo(weibos) + for downloader in self.downloaders: + downloader.download_files(weibos) + + def write_user(self, user): + """将用户信息写入数据库""" + for writer in self.writers: + writer.write_user(user) + + def get_user_info(self, user_uri): + # 获取用户信息、微博数、关注数、粉丝数 + self.user = {} + user = IndexParser(self.cookie, user_uri).get_user() + for k, v in user.items(): + self.user[k] = v + + def get_weibo_info(self): + """获取微博信息""" + try: + since_date = datetime_util.str_to_time(self.user_config["since_date"]) + now = datetime.now().strftime("%Y-%m-%d %H:%M") + now = datetime.strptime(now, "%Y-%m-%d %H:%M") + if since_date <= now: + page_num = IndexParser( + self.cookie, self.user_config["user_uri"] + ).get_page_num() # 获取微博总页数 + page1 = 0 + random_pages = random.randint(1, 5) + self.start_time = datetime.now().strftime("%Y-%m-%d %H:%M") + for page in tqdm(range(1, page_num + 1), desc="Progress"): + weibos, self.weibo_id_list = PageParser( + self.cookie, self.user_config["user_uri"], page, self.filter + ).get_one_page( + self.since_date, self.weibo_id_list + ) # 获取第page页的全部微博 + print( + u"{}已获取{}({})的第{}页微博{}".format( + "-" * 30, + self.user["nickname"], + self.user["id"], + page, + "-" * 30, + ) + ) + if weibos: + yield weibos + else: + return weibos + + # 通过加入随机等待避免被限制。爬虫速度过快容易被系统限制(一段时间后限 + # 制会自动解除),加入随机等待模拟人的操作,可降低被系统限制的风险。默 + # 
认是每爬取1到5页随机等待6到10秒,如果仍然被限,可适当增加sleep时间 + if (page - page1) % random_pages == 0 and page < page_num: + sleep(random.randint(6, 10)) + page1 = page + random_pages = random.randint(1, 5) + except Exception as e: + print("Error: ", e) + traceback.print_exc() + + def _get_filepath(self, type): + """获取结果文件路径""" + try: + if FLAGS.output_dir is not None: + file_dir = FLAGS.output_dir + else: + file_dir = ( + os.path.split(os.path.realpath(__file__))[0] + + os.sep + + "weibo" + + os.sep + + self.user["nickname"] + ) + if type == "img" or type == "video": + file_dir = file_dir + os.sep + type + if not os.path.isdir(file_dir): + os.makedirs(file_dir) + if type == "img" or type == "video": + return file_dir + file_path = file_dir + os.sep + self.user["id"] + "." + type + return file_path + except Exception as e: + print("Error: ", e) + traceback.print_exc() + + def initialize_info(self, user_config): + """初始化爬虫信息""" + self.got_num = 0 + self.weibo = [] + self.user_config = user_config + self.weibo_id_list = [] + + self.writers = [] + if "csv" in self.write_mode: + from .writer import CsvWriter + + self.writers.append(CsvWriter(self.filter, self._get_filepath("csv"))) + if "txt" in self.write_mode: + from .writer import TxtWriter + + self.writers.append(TxtWriter(self.filter, self._get_filepath("txt"))) + if "json" in self.write_mode: + from .writer import JsonWriter + + self.writers.append(JsonWriter(self._get_filepath("json"))) + if "mysql" in self.write_mode: + from .writer import MySqlWriter + + self.writers.append(MySqlWriter(self.mysql_config)) + if "mongo" in self.write_mode: + from .writer import MongoWriter + + self.writers.append(MongoWriter()) + + self.downloaders = [] + if self.pic_download == 1: + from .downloader import ImgDownloader + + self.downloaders.append(ImgDownloader(self._get_filepath("img"))) + if self.video_download == 1: + from .downloader import VideoDownloader + + self.downloaders.append(VideoDownloader(self._get_filepath("video"))) + + def 
start(self): + """运行爬虫""" + try: + for user_config in self.user_config_list: + self.get_user_info(user_config["user_uri"]) + printer.print_user_info(self.user) + print("*" * 100) + + self.initialize_info(user_config) + self.write_user(self.user) + print("*" * 100) + + for weibos in self.get_weibo_info(): + self.write_weibo(weibos) + if not self.filter: + print(u"共爬取" + str(self.got_num) + u"条微博") + else: + print(u"共爬取" + str(self.got_num) + u"条原创微博") + print(u"信息抓取完毕") + print("*" * 100) + + if self.user_config_file_path: + config_util.update_user_config_file( + self.user_config_file_path, + self.user_config["user_uri"], + self.user["nickname"], + self.start_time, + ) + except Exception as e: + print("Error: ", e) + traceback.print_exc() + + +def main(_): + try: + if FLAGS.config_path is not None: + config_path = FLAGS.config_path + else: + config_path = ( + os.path.split(os.path.realpath(__file__))[0] + os.sep + "config.json" + ) + if not os.path.isfile(config_path): + sys.exit(u"当前路径:%s 不存在配置文件config.json" % config_path) + with open(config_path) as f: + try: + config = json.loads(f.read()) + config_util.validate_config(config) + except ValueError: + sys.exit( + u"config.json 格式不正确,请参考 " + u"https://github.com/dataabc/weiboSpider#3程序设置" + ) + + wb = Spider(config) + wb.start() # 爬取微博信息 + except Exception as e: + print("Error: ", e) + traceback.print_exc() + + +if __name__ == "__main__": + app.run(main) diff --git a/weibo_spider/user_id_list.txt b/weibo_spider/user_id_list.txt new file mode 100644 index 00000000..7a9ac042 --- /dev/null +++ b/weibo_spider/user_id_list.txt @@ -0,0 +1 @@ +7053204102 majiko 2020-06-02 09:37 09:32 09:30 09:29 09:28 09:24 09:13 \ No newline at end of file diff --git a/weibo_spider/writer/__init__.py b/weibo_spider/writer/__init__.py new file mode 100644 index 00000000..df27490f --- /dev/null +++ b/weibo_spider/writer/__init__.py @@ -0,0 +1,7 @@ +from .csv_writer import CsvWriter +from .txt_writer import TxtWriter +from .json_writer 
import JsonWriter +from .mongo_writer import MongoWriter +from .mysql_writer import MySqlWriter + +__all__ = [CsvWriter, TxtWriter, JsonWriter, MongoWriter, MySqlWriter] diff --git a/weibo_spider/writer/csv_writer.py b/weibo_spider/writer/csv_writer.py new file mode 100644 index 00000000..9cd93a3a --- /dev/null +++ b/weibo_spider/writer/csv_writer.py @@ -0,0 +1,68 @@ +import sys +import codecs +import csv +import traceback + +from .writer import Writer + + +class CsvWriter(Writer): + def __init__(self, filter, file_path): + self.file_path = file_path + self.filter = filter + + def write_user(self, user): + self.user = user + + result_headers = [ + "微博id", + "微博正文", + "头条文章url", + "原始图片url", + "微博视频url", + "发布位置", + "发布时间", + "发布工具", + "点赞数", + "转发数", + "评论数", + ] + if not self.filter: + result_headers.insert(4, "被转发微博原始图片url") + result_headers.insert(5, "是否为原创微博") + try: + if sys.version < "3": # python2.x + reload(sys) + sys.setdefaultencoding("utf-8") + with open(self.file_path, "ab") as f: + f.write(codecs.BOM_UTF8) + writer = csv.writer(f) + writer.writerows([result_headers]) + else: # python3.x + with open(self.file_path, "a", encoding="utf-8-sig", newline="") as f: + writer = csv.writer(f) + writer.writerows([result_headers]) + except Exception as e: + print("Error: ", e) + traceback.print_exc() + + def write_weibo(self, weibos): + """将爬取的信息写入csv文件""" + try: + result_data = [w.values() for w in weibos] + if sys.version < "3": # python2.x + reload(sys) + sys.setdefaultencoding("utf-8") + with open(self.file_path, "ab") as f: + f.write(codecs.BOM_UTF8) + writer = csv.writer(f) + writer.writerows(result_data) + else: # python3.x + with open(self.file_path, "a", encoding="utf-8-sig", newline="") as f: + writer = csv.writer(f) + writer.writerows(result_data) + print(u"%d条微博写入csv文件完毕,保存路径:" % len(weibos)) + print(self.file_path) + except Exception as e: + print("Error: ", e) + traceback.print_exc() diff --git a/weibo_spider/writer/json_writer.py 
b/weibo_spider/writer/json_writer.py new file mode 100644 index 00000000..7b9159e5 --- /dev/null +++ b/weibo_spider/writer/json_writer.py @@ -0,0 +1,50 @@ +import codecs +import json +import os + +from .writer import Writer + + +class JsonWriter(Writer): + def __init__(self, file_path): + self.file_path = file_path + + def write_user(self, user): + self.user = user + + def _update_json_data(self, data, weibo_info): + """更新要写入json结果文件中的数据,已经存在于json中的信息更新为最新值,不存在的信息添加到data中""" + data["user"] = self.user + if data.get("weibo"): + is_new = 1 # 待写入微博是否全部为新微博,即待写入微博与json中的数据不重复 + for old in data["weibo"]: + if weibo_info[-1]["id"] == old["id"]: + is_new = 0 + break + if is_new == 0: + for new in weibo_info: + flag = 1 + for i, old in enumerate(data["weibo"]): + if new["id"] == old["id"]: + data["weibo"][i] = new + flag = 0 + break + if flag: + data["weibo"].append(new) + else: + data["weibo"] += weibo_info + else: + data["weibo"] = weibo_info + return data + + def write_weibo(self, weibos): + """将爬到的信息写入json文件""" + data = {} + if os.path.isfile(self.file_path): + with codecs.open(self.file_path, "r", encoding="utf-8") as f: + data = json.load(f) + data = self._update_json_data(data, weibos) + with codecs.open(self.file_path, "w", encoding="utf-8") as f: + f.write(json.dumps(data, indent=4, ensure_ascii=False)) + print(u"%d条微博写入json文件完毕,保存路径:" % len(weibos)) + print(self.file_path) diff --git a/weibo_spider/writer/mongo_writer.py b/weibo_spider/writer/mongo_writer.py new file mode 100644 index 00000000..ded49366 --- /dev/null +++ b/weibo_spider/writer/mongo_writer.py @@ -0,0 +1,46 @@ +import copy +import sys + +from .writer import Writer + + +class MongoWriter(Writer): + def __init__(self): + pass + + def _info_to_mongodb(self, collection, info_list): + """将爬取的信息写入MongoDB数据库""" + try: + import pymongo + except ImportError: + sys.exit(u"系统中可能没有安装pymongo库,请先运行 pip install pymongo ,再运行程序") + try: + from pymongo import MongoClient + + client = MongoClient() + db = 
client["weibo"] + collection = db[collection] + new_info_list = copy.deepcopy(info_list) + for info in new_info_list: + if not collection.find_one({"id": info["id"]}): + collection.insert_one(info) + else: + collection.update_one({"id": info["id"]}, {"$set": info}) + except pymongo.errors.ServerSelectionTimeoutError: + sys.exit(u"系统中可能没有安装或启动MongoDB数据库,请先根据系统环境安装或启动MongoDB,再运行程序") + + def write_weibo(self, weibos): + """将爬取的微博信息写入MongoDB数据库""" + weibo_list = [] + for w in weibos: + w["user_id"] = self.user["id"] + weibo_list.append(w) + self._info_to_mongodb("weibo", weibo_list) + print(u"%d条微博写入MongoDB数据库完毕" % len(weibos)) + + def write_user(self, user): + """将爬取的用户信息写入MongoDB数据库""" + self.user = user + user_list = [user] + self._info_to_mongodb("user", user_list) + print(u"%s信息写入MongoDB数据库完毕" % user["nickname"]) diff --git a/weibo_spider/writer/mysql_writer.py b/weibo_spider/writer/mysql_writer.py new file mode 100644 index 00000000..2d293e33 --- /dev/null +++ b/weibo_spider/writer/mysql_writer.py @@ -0,0 +1,126 @@ +import copy +import sys +import traceback + +try: + import pymysql +except ImportError: + sys.exit(u"系统中可能没有安装pymysql库,请先运行 pip install pymysql ,再运行程序") + + +from .writer import Writer + + +class MySqlWriter(Writer): + def __init__(self, mysql_config): + self.mysql_config = mysql_config + + def _mysql_create(self, connection, sql): + """创建MySQL数据库或表""" + try: + with connection.cursor() as cursor: + cursor.execute(sql) + finally: + connection.close() + + def _mysql_create_database(self, sql): + """创建MySQL数据库""" + try: + print(self.mysql_config, sql) + connection = pymysql.connect(**self.mysql_config) + self._mysql_create(connection, sql) + except pymysql.OperationalError: + sys.exit(u"系统中可能没有安装或正确配置MySQL数据库,请先根据系统环境安装或配置MySQL,再运行程序") + + def _mysql_create_table(self, sql): + """创建MySQL表""" + self.mysql_config["db"] = "weibo" + connection = pymysql.connect(**self.mysql_config) + self._mysql_create(connection, sql) + + def _mysql_insert(self, table, 
data_list): + """向MySQL表插入或更新数据""" + if len(data_list) > 0: + keys = ", ".join(data_list[0].keys()) + values = ", ".join(["%s"] * len(data_list[0])) + self.mysql_config["db"] = "weibo" + connection = pymysql.connect(**self.mysql_config) + cursor = connection.cursor() + sql = """INSERT INTO {table}({keys}) VALUES ({values}) ON + DUPLICATE KEY UPDATE""".format( + table=table, keys=keys, values=values + ) + update = ",".join( + [" {key} = values({key})".format(key=key) for key in data_list[0]] + ) + sql += update + try: + cursor.executemany(sql, [tuple(data.values()) for data in data_list]) + connection.commit() + except Exception as e: + connection.rollback() + print("Error: ", e) + traceback.print_exc() + finally: + connection.close() + + def write_weibo(self, weibos): + """将爬取的微博信息写入MySQL数据库""" + # 创建'weibo'表 + create_table = """ + CREATE TABLE IF NOT EXISTS weibo ( + id varchar(10) NOT NULL, + user_id varchar(12), + content varchar(2000), + article_url varchar(200), + original_pictures varchar(3000), + retweet_pictures varchar(3000), + original BOOLEAN NOT NULL DEFAULT 1, + video_url varchar(300), + publish_place varchar(100), + publish_time DATETIME NOT NULL, + publish_tool varchar(30), + up_num INT NOT NULL, + retweet_num INT NOT NULL, + comment_num INT NOT NULL, + PRIMARY KEY (id) + ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4""" + self._mysql_create_table(create_table) + # 在'weibo'表中插入或更新微博数据 + weibo_list = [] + info_list = copy.deepcopy(weibos) + for weibo in info_list: + weibo["user_id"] = self.user["id"] + weibo_list.append(weibo) + self._mysql_insert("weibo", weibo_list) + print(u"%d条微博写入MySQL数据库完毕" % len(weibos)) + + def write_user(self, user): + """将爬取的用户信息写入MySQL数据库""" + self.user = user + + # 创建'weibo'数据库 + create_database = """CREATE DATABASE IF NOT EXISTS weibo DEFAULT + CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci""" + self._mysql_create_database(create_database) + # 创建'user'表 + create_table = """ + CREATE TABLE IF NOT EXISTS user ( + id varchar(20) 
NOT NULL, + nickname varchar(30), + gender varchar(10), + location varchar(200), + birthday varchar(40), + description varchar(140), + verified_reason varchar(140), + talent varchar(200), + education varchar(200), + work varchar(200), + weibo_num INT, + following INT, + followers INT, + PRIMARY KEY (id) + ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4""" + self._mysql_create_table(create_table) + self._mysql_insert("user", [user]) + print(u"%s信息写入MySQL数据库完毕" % user["nickname"]) diff --git a/weibo_spider/writer/txt_writer.py b/weibo_spider/writer/txt_writer.py new file mode 100644 index 00000000..195b2cfa --- /dev/null +++ b/weibo_spider/writer/txt_writer.py @@ -0,0 +1,67 @@ +import sys +import traceback + +from .writer import Writer + + +class TxtWriter(Writer): + def __init__(self, filter, file_path): + self.filter = filter + self.file_path = file_path + + def write_user(self, user): + self.user = user + if self.filter: + result_header = u"\n\n原创微博内容: \n" + else: + result_header = u"\n\n微博内容: \n" + result_header = ( + u"用户信息\n用户昵称:" + + self.user["nickname"] + + u"\n用户id: " + + str(self.user["id"]) + + u"\n微博数: " + + str(self.user["weibo_num"]) + + u"\n关注数: " + + str(self.user["following"]) + + u"\n粉丝数: " + + str(self.user["followers"]) + + result_header + ) + + with open(self.file_path, "ab") as f: + f.write(result_header.encode(sys.stdout.encoding)) + + def write_weibo(self, weibo): + """将爬取的信息写入txt文件""" + try: + temp_result = [] + for i, w in enumerate(weibo): + temp_result.append( + w["content"] + + "\n" + + u"微博位置: " + + w["publish_place"] + + "\n" + + u"发布时间: " + + w["publish_time"] + + "\n" + + u"点赞数: " + + str(w["up_num"]) + + u" 转发数: " + + str(w["retweet_num"]) + + u" 评论数: " + + str(w["comment_num"]) + + "\n" + + u"发布工具: " + + w["publish_tool"] + + "\n\n" + ) + result = "".join(temp_result) + with open(self.file_path, "ab") as f: + f.write(result.encode(sys.stdout.encoding)) + print(u"%d条微博写入txt文件完毕,保存路径:" % len(weibo)) + print(self.file_path) + except Exception 
as e: + print("Error: ", e) + traceback.print_exc() diff --git a/weibo_spider/writer/writer.py b/weibo_spider/writer/writer.py new file mode 100644 index 00000000..c05846a2 --- /dev/null +++ b/weibo_spider/writer/writer.py @@ -0,0 +1,9 @@ +class Writer: + def __init__(self, config): + pass + + def write_weibo(self, weibo): + pass + + def write_user(self, user): + pass diff --git a/writer.py b/writer.py deleted file mode 100644 index c028816c..00000000 --- a/writer.py +++ /dev/null @@ -1,317 +0,0 @@ -# -*- coding: UTF-8 -*- - -import copy -import csv -import os -import sys -import traceback - - -def get_filepath(type, nickname): - """获取结果文件路径""" - file_dir = os.path.split( - os.path.realpath(__file__))[0] + os.sep + 'weibo' + os.sep + nickname - if type == 'img' or type == 'video': - file_dir = file_dir + os.sep + type - if not os.path.isdir(file_dir): - os.makedirs(file_dir) - if type == 'img' or type == 'video': - return file_dir - file_path = file_dir + os.sep + nickname + '.' + type - return file_path - - -def write_log(since_date): - """当程序因cookie过期停止运行时,将相关信息写入log.txt""" - file_dir = os.path.split( - os.path.realpath(__file__))[0] + os.sep + 'weibo' + os.sep - if not os.path.isdir(file_dir): - os.makedirs(file_dir) - file_path = file_dir + 'log.txt' - content = u'cookie已过期,从%s到今天的微博获取失败,请重新设置cookie\n' % since_date - with open(file_path, 'ab') as f: - f.write(content.encode(sys.stdout.encoding)) - - -class Writer: - def __init__(self, config): - write_mode = config['write_mode'] - self.writers = [] - - if 'txt' in write_mode: - self.writers.append(TxtWriter(config)) - if 'csv' in write_mode: - self.writers.append(CsvWriter(config)) - if 'mysql' in write_mode: - self.writers.append(MysqlWriter(config)) - if 'mongo' in write_mode: - self.writers.append(MongoWriter(config)) - - def write_user(self, user): - for writer in self.writers: - if isinstance(writer, MongoWriter): - writer.write_user(copy.deepcopy(user)) - else: - writer.write_user(user) - - def 
write_weibo(self, weibo): - for writer in self.writers: - if isinstance(writer, MongoWriter) or isinstance( - writer, MysqlWriter): - writer.write_weibo(copy.deepcopy(weibo)) - else: - writer.write_weibo(weibo) - - -class TxtWriter: - def __init__(self, config): - self.config = config - - def write_user(self, user): - self.user = user - if self.config['filter']: - result_header = u'\n\n原创微博内容: \n' - else: - result_header = u'\n\n微博内容: \n' - result_header = (u'用户信息\n用户昵称:' + user['nickname'] + u'\n用户id: ' + - str(user['id']) + u'\n微博数: ' + - str(user['weibo_num']) + u'\n关注数: ' + - str(user['following']) + u'\n粉丝数: ' + - str(user['followers']) + result_header) - - with open(get_filepath('txt', user['nickname']), 'ab') as f: - f.write(result_header.encode(sys.stdout.encoding)) - - def write_weibo(self, weibo): - """将爬取的信息写入txt文件""" - - temp_result = [] - for w in weibo: - temp_result.append(w['content'] + '\n' + u'微博位置: ' + - w['publish_place'] + '\n' + u'发布时间: ' + - w['publish_time'] + '\n' + u'点赞数: ' + - str(w['up_num']) + u' 转发数: ' + - str(w['retweet_num']) + u' 评论数: ' + - str(w['comment_num']) + '\n' + u'发布工具: ' + - w['publish_tool'] + '\n\n') - result = ''.join(temp_result) - with open(get_filepath('txt', self.user['nickname']), 'ab') as f: - f.write(result.encode(sys.stdout.encoding)) - print(u'%d条微博写入txt文件完毕,保存路径:' % len(weibo)) - print(get_filepath('txt', self.user['nickname'])) - - -class CsvWriter: - def __init__(self, config): - self.config = config - - def write_user(self, user): - self.user = user - result_headers = [ - '微博id', - '微博正文', - '发布位置', - '发布时间', - '发布工具', - '点赞数', - '转发数', - '评论数', - '原始图片url', - '微博视频url', - ] - if not self.config['filter']: - result_headers.insert(-1, '被转发微博原始图片url') - result_headers.insert(-1, '是否为原创微博') - - if sys.version < '3': # python2.x - reload(sys) - sys.setdefaultencoding('utf-8') - with open(get_filepath('csv', self.user['nickname']), 'ab') as f: - csv_writer = csv.writer(f) - csv_writer.writerows([result_headers]) 
- else: # python3.x - with open(get_filepath('csv', self.user['nickname']), - 'a', - encoding='utf-8-sig', - newline='') as f: - csv_writer = csv.writer(f) - csv_writer.writerows([result_headers]) - - def write_weibo(self, weibo): - """将爬取的信息写入csv文件""" - result_data = [w.values() for w in weibo] - - if sys.version < '3': # python2.x - reload(sys) - sys.setdefaultencoding('utf-8') - with open(get_filepath('csv', self.user['nickname']), 'ab') as f: - csv_writer = csv.writer(f) - csv_writer.writerows(result_data) - else: # python3.x - with open(get_filepath('csv', self.user['nickname']), - 'a', - encoding='utf-8-sig', - newline='') as f: - csv_writer = csv.writer(f) - csv_writer.writerows(result_data) - - print(u'%d条微博写入csv文件完毕,保存路径:' % len(weibo)) - print(get_filepath('csv', self.user['nickname'])) - - -class MongoWriter: - def __init__(self, config): - self.config = config - - def info_to_mongodb(self, collection, info_list): - """将爬取的信息写入MongoDB数据库""" - try: - import pymongo - from pymongo import MongoClient - except ImportError: - sys.exit(u'系统中可能没有安装pymongo库,请先运行 pip install pymongo ,再运行程序') - - try: - client = MongoClient() - except pymongo.errors.ServerSelectionTimeoutError: - sys.exit(u'系统中可能没有安装或启动MongoDB数据库,请先根据系统环境安装或启动MongoDB,再运行程序') - - db = client['weibo'] - collection = db[collection] - for info in info_list: - if not collection.find_one({'id': info['id']}): - collection.insert_one(info) - else: - collection.update_one({'id': info['id']}, {'$set': info}) - - def write_user(self, user): - """将爬取的用户信息写入MongoDB数据库""" - self.user = user - - user_list = [user] - self.info_to_mongodb('user', user_list) - print(u'%s信息写入MongoDB数据库完毕' % user['nickname']) - - def write_weibo(self, weibo): - """将爬取的微博信息写入MongoDB数据库""" - weibo_list = [] - for w in weibo: - w['user_id'] = self.user['id'] - weibo_list.append(w) - self.info_to_mongodb('weibo', weibo_list) - print(u'%d条微博写入MongoDB数据库完毕' % len(weibo)) - - -class MysqlWriter: - def __init__(self, config): - self.config = 
config - - def write_user(self, user): - """将爬取的用户信息写入MySQL数据库""" - self.user = user - # 创建'weibo'数据库 - create_database = """CREATE DATABASE IF NOT EXISTS weibo DEFAULT - CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci""" - self.mysql_create_database(create_database) - # 创建'user'表 - create_table = """ - CREATE TABLE IF NOT EXISTS user ( - id varchar(12) NOT NULL, - nickname varchar(30), - weibo_num INT, - following INT, - followers INT, - PRIMARY KEY (id) - ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4""" - self.mysql_create_table(create_table) - self.mysql_insert('user', [user]) - print(u'%s信息写入MySQL数据库完毕' % user['nickname']) - - def write_weibo(self, weibo): - """将爬取的微博信息写入MySQL数据库""" - # 创建'weibo'表 - create_table = """ - CREATE TABLE IF NOT EXISTS weibo ( - id varchar(10) NOT NULL, - user_id varchar(12), - content varchar(2000), - original_pictures varchar(1000), - retweet_pictures varchar(1000), - original BOOLEAN NOT NULL DEFAULT 1, - video_url varchar(300), - publish_place varchar(100), - publish_time DATETIME NOT NULL, - publish_tool varchar(30), - up_num INT NOT NULL, - retweet_num INT NOT NULL, - comment_num INT NOT NULL, - PRIMARY KEY (id) - ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4""" - self.mysql_create_table(create_table) - # 在'weibo'表中插入或更新微博数据 - weibo_list = [] - for w in weibo: - w['user_id'] = self.user['id'] - weibo_list.append(w) - self.mysql_insert('weibo', weibo_list) - print(u'%d条微博写入MySQL数据库完毕' % len(weibo)) - - def mysql_create(self, connection, sql): - """创建MySQL数据库或表""" - try: - with connection.cursor() as cursor: - cursor.execute(sql) - finally: - connection.close() - - def mysql_create_database(self, sql): - """创建MySQL数据库""" - try: - import pymysql - except ImportError: - sys.exit(u'系统中可能没有安装pymysql库,请先运行 pip install pymysql ,再运行程序') - mysql_config = self.config['mysql_config'] - try: - connection = pymysql.connect(**mysql_config) - except pymysql.err.OperationalError: - sys.exit(u'系统中可能没有安装或启动MySQL数据库或配置错误,请先根据系统环境安装或启动MySQL,再运行程序') - 
self.mysql_create(connection, sql) - - def mysql_create_table(self, sql): - """创建MySQL表""" - import pymysql - mysql_config = self.config['mysql_config'] - mysql_config['db'] = 'weibo' - connection = pymysql.connect(**mysql_config) - self.mysql_create(connection, sql) - - def mysql_insert(self, table, data_list): - """向MySQL表插入或更新数据""" - import pymysql - mysql_config = self.config['mysql_config'] - - if len(data_list) > 0: - keys = ', '.join(data_list[0].keys()) - values = ', '.join(['%s'] * len(data_list[0])) - mysql_config['db'] = 'weibo' - connection = pymysql.connect(**mysql_config) - cursor = connection.cursor() - sql = """INSERT INTO {table}({keys}) VALUES ({values}) ON - DUPLICATE KEY UPDATE""".format(table=table, - keys=keys, - values=values) - update = ','.join([ - " {key} = values({key})".format(key=key) - for key in data_list[0] - ]) - sql += update - try: - cursor.executemany( - sql, [tuple(data.values()) for data in data_list]) - connection.commit() - except Exception as e: - connection.rollback() - print('Error: ', e) - traceback.print_exc() - finally: - connection.close() From ab4559cef09cead8f6aec5f9ca0cfc141c47a680 Mon Sep 17 00:00:00 2001 From: dataabc Date: Tue, 2 Jun 2020 21:37:25 +0800 Subject: [PATCH 167/363] =?UTF-8?q?perf:=20=E4=BC=98=E5=8C=96=E7=94=A8?= =?UTF-8?q?=E6=88=B7=E6=8F=90=E7=A4=BA?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- weibo_spider/weiboSpider.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/weibo_spider/weiboSpider.py b/weibo_spider/weiboSpider.py index 05b2ae8a..4ed706fc 100644 --- a/weibo_spider/weiboSpider.py +++ b/weibo_spider/weiboSpider.py @@ -1159,14 +1159,16 @@ def get_config(): config_path = FLAGS.config_path elif not os.path.isfile(config_path): shutil.copy(src, config_path) - sys.exit(u'请先配置当前目录(%s)下的config.json文件' % os.getcwd()) + sys.exit(u'请先配置当前目录(%s)下的config.json文件,' + u'如果想了解config.json参数的具体意义及配置方法,请访问\n' + 
u'https://github.com/dataabc/weiboSpider#2程序设置' % os.getcwd()) try: with open(config_path) as f: config = json.loads(f.read()) return config except ValueError: - sys.exit(u'config.json 格式不正确,请参考 ' - u'https://github.com/dataabc/weiboSpider#3程序设置') + sys.exit(u'config.json 格式不正确,请访问 ' + u'https://github.com/dataabc/weiboSpider#2程序设置') def main(argv): From 592496b3e8d64d79a75f27dd55b6420e153a4e74 Mon Sep 17 00:00:00 2001 From: Zhengyang Song Date: Tue, 2 Jun 2020 23:23:49 +0800 Subject: [PATCH 168/363] serveral bug fix --- .gitignore | 2 +- weibo_spider/__init__,py | 0 weibo_spider/__main__.py | 2 +- .../{config.json => config_sample.json} | 0 weibo_spider/{spider.py => weiboSpider.py} | 50 +++++++++---------- 5 files changed, 26 insertions(+), 28 deletions(-) delete mode 100644 weibo_spider/__init__,py rename weibo_spider/{config.json => config_sample.json} (100%) rename weibo_spider/{spider.py => weiboSpider.py} (89%) diff --git a/.gitignore b/.gitignore index 71f03014..78d8e9a8 100644 --- a/.gitignore +++ b/.gitignore @@ -7,4 +7,4 @@ dist/ *.egg-info weibo/ -config_sample.json +config.json diff --git a/weibo_spider/__init__,py b/weibo_spider/__init__,py deleted file mode 100644 index e69de29b..00000000 diff --git a/weibo_spider/__main__.py b/weibo_spider/__main__.py index d7d8fc71..f562cfae 100644 --- a/weibo_spider/__main__.py +++ b/weibo_spider/__main__.py @@ -1,4 +1,4 @@ -from .spider import main +from .weiboSpider import main from absl import app diff --git a/weibo_spider/config.json b/weibo_spider/config_sample.json similarity index 100% rename from weibo_spider/config.json rename to weibo_spider/config_sample.json diff --git a/weibo_spider/spider.py b/weibo_spider/weiboSpider.py similarity index 89% rename from weibo_spider/spider.py rename to weibo_spider/weiboSpider.py index 65961149..eb0d4d22 100644 --- a/weibo_spider/spider.py +++ b/weibo_spider/weiboSpider.py @@ -4,6 +4,7 @@ import json import os import random +import shutil import sys import traceback 
from datetime import date, datetime, timedelta @@ -46,9 +47,7 @@ def __init__(self, config): if FLAGS.user_id_list is not None: user_id_list = FLAGS.user_id_list else: - user_id_list = ( - os.path.split(os.path.realpath(__file__))[0] + os.sep + user_id_list - ) + user_id_list = os.getcwd() + os.sep + user_id_list if not os.path.isfile(user_id_list): sys.exit(u"当前路径:%s 不存在配置文件config.json" % user_id_list) self.user_config_file_path = user_id_list # 用户配置文件路径 @@ -139,11 +138,7 @@ def _get_filepath(self, type): file_dir = FLAGS.output_dir else: file_dir = ( - os.path.split(os.path.realpath(__file__))[0] - + os.sep - + "weibo" - + os.sep - + self.user["nickname"] + os.getcwd() + os.sep + "weibo" + os.sep + self.user["nickname"] ) if type == "img" or type == "video": file_dir = file_dir + os.sep + type @@ -229,26 +224,29 @@ def start(self): traceback.print_exc() -def main(_): +def _get_config(): + """获取config.json数据""" + src = os.path.split(os.path.realpath(__file__))[0] + os.sep + 'config_sample.json' + config_path = os.getcwd() + os.sep + 'config.json' + if FLAGS.config_path: + config_path = FLAGS.config_path + elif not os.path.isfile(config_path): + shutil.copy(src, config_path) + sys.exit(u'请先配置当前目录(%s)下的config.json文件,' + u'如果想了解config.json参数的具体意义及配置方法,请访问\n' + u'https://github.com/dataabc/weiboSpider#2程序设置' % os.getcwd()) try: - if FLAGS.config_path is not None: - config_path = FLAGS.config_path - else: - config_path = ( - os.path.split(os.path.realpath(__file__))[0] + os.sep + "config.json" - ) - if not os.path.isfile(config_path): - sys.exit(u"当前路径:%s 不存在配置文件config.json" % config_path) with open(config_path) as f: - try: - config = json.loads(f.read()) - config_util.validate_config(config) - except ValueError: - sys.exit( - u"config.json 格式不正确,请参考 " - u"https://github.com/dataabc/weiboSpider#3程序设置" - ) + config = json.loads(f.read()) + return config + except ValueError: + sys.exit(u'config.json 格式不正确,请访问 ' + u'https://github.com/dataabc/weiboSpider#2程序设置') + +def 
main(_): + try: + config = _get_config() wb = Spider(config) wb.start() # 爬取微博信息 except Exception as e: @@ -257,4 +255,4 @@ def main(_): if __name__ == "__main__": - app.run(main) + app.run(main) \ No newline at end of file From 7f0a82cb11eba496f2c6cc2e027c1271142ded43 Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Sat, 6 Jun 2020 17:52:03 +0800 Subject: [PATCH 169/363] Update README.md --- README.md | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index fd9893da..6fe59d14 100644 --- a/README.md +++ b/README.md @@ -95,7 +95,7 @@ cookie修改完成后运行weiboSpider.py,该文件位于weibospider=>weibo_spider: ```bash -$ python weiboSpider.py +$ python3 weiboSpider.py ``` 程序会自动生成一个weibo文件夹,我们以后爬取的所有微博都被存储在这里。然后程序在该文件夹下生成一个名为"Dear-迪丽热巴"的文件夹,迪丽热巴的所有微博爬取结果都在这里。"Dear-迪丽热巴"文件夹里包含一个csv文件、一个txt文件、一个json文件、一个img文件夹和一个video文件夹,img文件夹用来存储下载到的图片,video文件夹用来存储下载到的视频。如果你设置了保存数据库功能,这些信息也会保存在数据库里,数据库设置见[设置数据库](#3设置数据库可选)部分。

@@ -168,18 +168,13 @@ json文件包含迪丽热巴的用户信息和上千条微博信息,内容较 ## 使用说明 ### 0.版本 -本程序有两个版本,**功能完成一样**。你现在看到的是单文件版,另一个是多文件版,[多文件版](https://github.com/dataabc/weiboSpider/tree/multi-file)位于multi-file分支。
-二者的区别在于: ->单文件版是所有代码都写到一个文件里,即[weiboSpider.py](https://github.com/dataabc/weiboSpider/blob/master/weiboSpider.py)。多文件版重构了单文件版,按照代码功能分成了几个文件,代码更清晰,更易读。如果你仅仅想使用程序,这两个版本用哪一个都一样;如果你不仅想使用,还想开发新功能,多文件版可能更容易。 - -多文件版由[songzy12](https://github.com/songzy12)重构。songzy12非常认真负责,对于我发现的问题都很耐心地修复了,而且效率非常高,在此感谢。
-本使用说明是单文件版的使用说明。 +本程序有两个版本,你现在看到的是python3版,另一个是python2版,python2版位于[python2分支](https://github.com/dataabc/weiboSpider/tree/python2)。目前主力开发python3版,包括新功能开发和bug修复;python2版仅支持bug修复。推荐python3用户使用当前版本,推荐python2用户使用[python2版](https://github.com/dataabc/weiboSpider/tree/python2),本使用说明是python3版的使用说明。
### 1.下载脚本 本程序提供两种下载方式,一种是**源码下载安装**,另一种是**pip安装**,二者功能完全相同。如果你需要修改源码,建议使用第一种方式,否则选哪种安装方式都可以。
**源码下载安装**
下载脚本 ```bash -$ git clone -b multi-file https://github.com/dataabc/weibospider.git +$ git clone https://github.com/dataabc/weibospider.git ``` 安装依赖 ```bash @@ -319,7 +314,7 @@ MySQL和MongDB数据库的写入内容一样。程序首先会创建一个名为 ### 4.运行脚本 **源码下载安装**的用户可以在weiboSpider.py文件所在目录下运行 ```bash -$ python weiboSpider.py +$ python3 weiboSpider.py ``` **pip安装**的用户可以在任意有写权限的目录运行 ```bash From 837e616cdcad114681fc74d1d78b56c7379dcac2 Mon Sep 17 00:00:00 2001 From: dataabc Date: Sat, 6 Jun 2020 21:31:37 +0800 Subject: [PATCH 170/363] =?UTF-8?q?fix:=20=E4=BF=AE=E5=A4=8D=E6=A8=A1?= =?UTF-8?q?=E5=9D=97=E8=B7=AF=E5=BE=84=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../{config_sample.json => config.json} | 2 +- weibo_spider/config_util.py | 15 ++- weibo_spider/weiboSpider.py | 97 ++++++++-------- .../{parser => weibo_parser}/__init__.py | 0 .../comment_parser.py | 0 .../{parser => weibo_parser}/index_parser.py | 0 .../{parser => weibo_parser}/info_parser.py | 0 .../{parser => weibo_parser}/page_parser.py | 104 +++++++++--------- .../{parser => weibo_parser}/parser.py | 0 weibo_spider/{parser => weibo_parser}/util.py | 0 10 files changed, 111 insertions(+), 107 deletions(-) rename weibo_spider/{config_sample.json => config.json} (99%) rename weibo_spider/{parser => weibo_parser}/__init__.py (100%) rename weibo_spider/{parser => weibo_parser}/comment_parser.py (100%) rename weibo_spider/{parser => weibo_parser}/index_parser.py (100%) rename weibo_spider/{parser => weibo_parser}/info_parser.py (100%) rename weibo_spider/{parser => weibo_parser}/page_parser.py (84%) rename weibo_spider/{parser => weibo_parser}/parser.py (100%) rename weibo_spider/{parser => weibo_parser}/util.py (100%) diff --git a/weibo_spider/config_sample.json b/weibo_spider/config.json similarity index 99% rename from weibo_spider/config_sample.json rename to weibo_spider/config.json index 04db0f39..88b06773 100644 --- a/weibo_spider/config_sample.json +++ 
b/weibo_spider/config.json @@ -13,4 +13,4 @@ "password": "123456", "charset": "utf8mb4" } -} +} \ No newline at end of file diff --git a/weibo_spider/config_util.py b/weibo_spider/config_util.py index 7eb5b199..12f6f2b8 100644 --- a/weibo_spider/config_util.py +++ b/weibo_spider/config_util.py @@ -33,17 +33,19 @@ def validate_config(config): sys.exit(u"write_mode值应为list类型") for mode in config["write_mode"]: if mode not in write_mode: - sys.exit(u"%s为无效模式,请从txt、csv、json、mongo和mysql中挑选一个或多个作为write_mode" % mode) + sys.exit( + u"%s为无效模式,请从txt、csv、json、mongo和mysql中挑选一个或多个作为write_mode" % + mode) # 验证user_id_list user_id_list = config["user_id_list"] - if (not isinstance(user_id_list, list)) and (not user_id_list.endswith(".txt")): + if (not isinstance(user_id_list, + list)) and (not user_id_list.endswith(".txt")): sys.exit(u"user_id_list值应为list类型或txt文件路径") if not isinstance(user_id_list, list): if not os.path.isabs(user_id_list): - user_id_list = ( - os.path.split(os.path.realpath(__file__))[0] + os.sep + user_id_list - ) + user_id_list = (os.path.split(os.path.realpath(__file__))[0] + + os.sep + user_id_list) if not os.path.isfile(user_id_list): sys.exit(u"不存在%s文件" % user_id_list) @@ -74,7 +76,8 @@ def get_user_config_list(file_name, default_since_date): return user_config_list -def update_user_config_file(user_config_file_path, user_uri, nickname, start_time): +def update_user_config_file(user_config_file_path, user_uri, nickname, + start_time): """更新用户配置文件""" with open(user_config_file_path, "rb") as f: lines = f.read().splitlines() diff --git a/weibo_spider/weiboSpider.py b/weibo_spider/weiboSpider.py index eb0d4d22..ab677b66 100644 --- a/weibo_spider/weiboSpider.py +++ b/weibo_spider/weiboSpider.py @@ -10,13 +10,13 @@ from datetime import date, datetime, timedelta from time import sleep -from tqdm import tqdm from absl import app, flags +from tqdm import tqdm -from . import datetime_util -from . import config_util -from . 
import printer -from .parser import IndexParser, PageParser +import config_util +import datetime_util +import printer +from weibo_parser import IndexParser, PageParser FLAGS = flags.FLAGS @@ -28,18 +28,18 @@ class Spider: def __init__(self, config): """Weibo类初始化""" - self.filter = config["filter"] # 取值范围为0、1,程序默认值为0,代表要爬取用户的全部微博,1代表只爬取用户的原创微博 + self.filter = config[ + "filter"] # 取值范围为0、1,程序默认值为0,代表要爬取用户的全部微博,1代表只爬取用户的原创微博 since_date = str(config["since_date"]) if since_date.isdigit(): since_date = str(date.today() - timedelta(int(since_date))) self.since_date = since_date # 起始时间,即爬取发布日期从该值到现在的微博,形式为yyyy-mm-dd self.write_mode = config[ - "write_mode" - ] # 结果信息保存类型,为list形式,可包含txt、csv、json、mongo和mysql五种类型 - self.pic_download = config["pic_download"] # 取值范围为0、1,程序默认值为0,代表不下载微博原始图片,1代表下载 + "write_mode"] # 结果信息保存类型,为list形式,可包含txt、csv、json、mongo和mysql五种类型 + self.pic_download = config[ + "pic_download"] # 取值范围为0、1,程序默认值为0,代表不下载微博原始图片,1代表下载 self.video_download = config[ - "video_download" - ] # 取值范围为0、1,程序默认为0,代表不下载微博视频,1代表下载 + "video_download"] # 取值范围为0、1,程序默认为0,代表不下载微博视频,1代表下载 self.cookie = {"Cookie": config["cookie"]} self.mysql_config = config.get("mysql_config") # MySQL数据库连接配置,可以不填 user_id_list = config["user_id_list"] @@ -52,14 +52,13 @@ def __init__(self, config): sys.exit(u"当前路径:%s 不存在配置文件config.json" % user_id_list) self.user_config_file_path = user_id_list # 用户配置文件路径 user_config_list = config_util.get_user_config_list( - user_id_list, self.since_date - ) + user_id_list, self.since_date) else: self.user_config_file_path = "" - user_config_list = [ - {"user_uri": user_id, "since_date": self.since_date} - for user_id in user_id_list - ] + user_config_list = [{ + "user_uri": user_id, + "since_date": self.since_date + } for user_id in user_id_list] self.user_config_list = user_config_list # 要爬取的微博用户的user_config列表 self.user_config = {} # 用户配置,包含用户id和since_date self.start_time = "" # 获取用户第一条微博时的时间 @@ -90,31 +89,30 @@ def get_user_info(self, user_uri): def 
get_weibo_info(self): """获取微博信息""" try: - since_date = datetime_util.str_to_time(self.user_config["since_date"]) + since_date = datetime_util.str_to_time( + self.user_config["since_date"]) now = datetime.now().strftime("%Y-%m-%d %H:%M") now = datetime.strptime(now, "%Y-%m-%d %H:%M") if since_date <= now: page_num = IndexParser( - self.cookie, self.user_config["user_uri"] - ).get_page_num() # 获取微博总页数 + self.cookie, + self.user_config["user_uri"]).get_page_num() # 获取微博总页数 page1 = 0 random_pages = random.randint(1, 5) self.start_time = datetime.now().strftime("%Y-%m-%d %H:%M") for page in tqdm(range(1, page_num + 1), desc="Progress"): weibos, self.weibo_id_list = PageParser( - self.cookie, self.user_config["user_uri"], page, self.filter - ).get_one_page( - self.since_date, self.weibo_id_list - ) # 获取第page页的全部微博 - print( - u"{}已获取{}({})的第{}页微博{}".format( - "-" * 30, - self.user["nickname"], - self.user["id"], - page, - "-" * 30, - ) - ) + self.cookie, self.user_config["user_uri"], + page, self.filter).get_one_page( + self.since_date, + self.weibo_id_list) # 获取第page页的全部微博 + print(u"{}已获取{}({})的第{}页微博{}".format( + "-" * 30, + self.user["nickname"], + self.user["id"], + page, + "-" * 30, + )) if weibos: yield weibos else: @@ -137,9 +135,8 @@ def _get_filepath(self, type): if FLAGS.output_dir is not None: file_dir = FLAGS.output_dir else: - file_dir = ( - os.getcwd() + os.sep + "weibo" + os.sep + self.user["nickname"] - ) + file_dir = (os.getcwd() + os.sep + "weibo" + os.sep + + self.user["nickname"]) if type == "img" or type == "video": file_dir = file_dir + os.sep + type if not os.path.isdir(file_dir): @@ -161,35 +158,38 @@ def initialize_info(self, user_config): self.writers = [] if "csv" in self.write_mode: - from .writer import CsvWriter + from writer import CsvWriter - self.writers.append(CsvWriter(self.filter, self._get_filepath("csv"))) + self.writers.append( + CsvWriter(self.filter, self._get_filepath("csv"))) if "txt" in self.write_mode: - from .writer import 
TxtWriter + from writer import TxtWriter - self.writers.append(TxtWriter(self.filter, self._get_filepath("txt"))) + self.writers.append( + TxtWriter(self.filter, self._get_filepath("txt"))) if "json" in self.write_mode: - from .writer import JsonWriter + from writer import JsonWriter self.writers.append(JsonWriter(self._get_filepath("json"))) if "mysql" in self.write_mode: - from .writer import MySqlWriter + from writer import MySqlWriter self.writers.append(MySqlWriter(self.mysql_config)) if "mongo" in self.write_mode: - from .writer import MongoWriter + from writer import MongoWriter self.writers.append(MongoWriter()) self.downloaders = [] if self.pic_download == 1: - from .downloader import ImgDownloader + from downloader import ImgDownloader self.downloaders.append(ImgDownloader(self._get_filepath("img"))) if self.video_download == 1: - from .downloader import VideoDownloader + from downloader import VideoDownloader - self.downloaders.append(VideoDownloader(self._get_filepath("video"))) + self.downloaders.append( + VideoDownloader(self._get_filepath("video"))) def start(self): """运行爬虫""" @@ -226,7 +226,8 @@ def start(self): def _get_config(): """获取config.json数据""" - src = os.path.split(os.path.realpath(__file__))[0] + os.sep + 'config_sample.json' + src = os.path.split( + os.path.realpath(__file__))[0] + os.sep + 'config_sample.json' config_path = os.getcwd() + os.sep + 'config.json' if FLAGS.config_path: config_path = FLAGS.config_path @@ -255,4 +256,4 @@ def main(_): if __name__ == "__main__": - app.run(main) \ No newline at end of file + app.run(main) diff --git a/weibo_spider/parser/__init__.py b/weibo_spider/weibo_parser/__init__.py similarity index 100% rename from weibo_spider/parser/__init__.py rename to weibo_spider/weibo_parser/__init__.py diff --git a/weibo_spider/parser/comment_parser.py b/weibo_spider/weibo_parser/comment_parser.py similarity index 100% rename from weibo_spider/parser/comment_parser.py rename to 
weibo_spider/weibo_parser/comment_parser.py diff --git a/weibo_spider/parser/index_parser.py b/weibo_spider/weibo_parser/index_parser.py similarity index 100% rename from weibo_spider/parser/index_parser.py rename to weibo_spider/weibo_parser/index_parser.py diff --git a/weibo_spider/parser/info_parser.py b/weibo_spider/weibo_parser/info_parser.py similarity index 100% rename from weibo_spider/parser/info_parser.py rename to weibo_spider/weibo_parser/info_parser.py diff --git a/weibo_spider/parser/page_parser.py b/weibo_spider/weibo_parser/page_parser.py similarity index 84% rename from weibo_spider/parser/page_parser.py rename to weibo_spider/weibo_parser/page_parser.py index 1cd87e55..d9083b7b 100644 --- a/weibo_spider/parser/page_parser.py +++ b/weibo_spider/weibo_parser/page_parser.py @@ -1,15 +1,19 @@ -from datetime import datetime, timedelta -import traceback -from collections import OrderedDict import re import sys +import traceback +from collections import OrderedDict +from datetime import datetime, timedelta import requests -from .parser import Parser +import datetime_util +import printer + from .comment_parser import CommentParser -from .util import handle_html, handle_garbled -from .. 
import printer, datetime_util +from .parser import Parser +from .util import handle_garbled, handle_html + +sys.path.append('..') class PageParser(Parser): @@ -27,14 +31,15 @@ def get_one_page(self, since_date, weibo_id_list): is_exist = info[0].xpath("div/span[@class='ctt']") weibos = [] if is_exist: - since_date = datetime_util.str_to_time(since_date) + since_date = datetime_util.str_to_time(since_date) for i in range(0, len(info) - 2): weibo = self.get_one_weibo(info[i]) if weibo: if weibo["id"] in weibo_id_list: continue - publish_time = datetime_util.str_to_time(weibo["publish_time"]) - + publish_time = datetime_util.str_to_time( + weibo["publish_time"]) + if publish_time < since_date: if self.is_pinned_weibo(info[i]): continue @@ -60,10 +65,11 @@ def get_original_weibo(self, info, weibo_id): """获取原创微博""" try: weibo_content = handle_garbled(info) - weibo_content = weibo_content[: weibo_content.rfind(u"赞")] + weibo_content = weibo_content[:weibo_content.rfind(u"赞")] a_text = info.xpath("div//a/text()") if u"全文" in a_text: - wb_content = CommentParser(self.cookie, weibo_id).get_long_weibo() + wb_content = CommentParser(self.cookie, + weibo_id).get_long_weibo() if wb_content: weibo_content = wb_content return weibo_content @@ -75,29 +81,23 @@ def get_retweet(self, info, weibo_id): """获取转发微博""" try: weibo_content = handle_garbled(info) - weibo_content = weibo_content[ - weibo_content.find(":") + 1 : weibo_content.rfind(u"赞") - ] - weibo_content = weibo_content[: weibo_content.rfind(u"赞")] + weibo_content = weibo_content[weibo_content.find(":") + + 1:weibo_content.rfind(u"赞")] + weibo_content = weibo_content[:weibo_content.rfind(u"赞")] a_text = info.xpath("div//a/text()") if u"全文" in a_text: - wb_content = CommentParser(self.cookie, weibo_id).get_long_retweet() + wb_content = CommentParser(self.cookie, + weibo_id).get_long_retweet() if wb_content: weibo_content = wb_content retweet_reason = handle_garbled(info.xpath("div")[-1]) - retweet_reason = retweet_reason[: 
retweet_reason.rindex(u"赞")] + retweet_reason = retweet_reason[:retweet_reason.rindex(u"赞")] original_user = info.xpath("div/span[@class='cmt']/a/text()") if original_user: original_user = original_user[0] - weibo_content = ( - retweet_reason - + "\n" - + u"原始用户: " - + original_user - + "\n" - + u"转发内容: " - + weibo_content - ) + weibo_content = (retweet_reason + "\n" + u"原始用户: " + + original_user + "\n" + u"转发内容: " + + weibo_content) else: weibo_content = retweet_reason + "\n" + u"转发内容: " + weibo_content return weibo_content @@ -135,17 +135,13 @@ def get_publish_place(self, info): a_list = div_first.xpath("a") publish_place = u"无" for a in a_list: - if ( - "place.weibo.com" in a.xpath("@href")[0] - and a.xpath("text()")[0] == u"显示地图" - ): + if ("place.weibo.com" in a.xpath("@href")[0] + and a.xpath("text()")[0] == u"显示地图"): weibo_a = div_first.xpath("span[@class='ctt']/a") if len(weibo_a) >= 1: publish_place = weibo_a[-1] - if ( - u"视频" - == div_first.xpath("span[@class='ctt']/a/text()")[-1][-2:] - ): + if (u"视频" == div_first.xpath( + "span[@class='ctt']/a/text()")[-1][-2:]): if len(weibo_a) >= 2: publish_place = weibo_a[-2] else: @@ -166,9 +162,10 @@ def get_publish_time(self, info): if u"刚刚" in publish_time: publish_time = datetime.now().strftime("%Y-%m-%d %H:%M") elif u"分钟" in publish_time: - minute = publish_time[: publish_time.find(u"分钟")] + minute = publish_time[:publish_time.find(u"分钟")] minute = timedelta(minutes=int(minute)) - publish_time = (datetime.now() - minute).strftime("%Y-%m-%d %H:%M") + publish_time = (datetime.now() - + minute).strftime("%Y-%m-%d %H:%M") elif u"今天" in publish_time: today = datetime.now().strftime("%Y-%m-%d") time = publish_time[3:] @@ -209,7 +206,7 @@ def get_weibo_footer(self, info): pattern = r"\d+" str_footer = info.xpath("div")[-1] str_footer = handle_garbled(str_footer) - str_footer = str_footer[str_footer.rfind(u"赞") :] + str_footer = str_footer[str_footer.rfind(u"赞"):] weibo_footer = re.findall(pattern, str_footer, re.M) 
up_num = int(weibo_footer[0]) @@ -260,15 +257,17 @@ def get_video_url(self, info, is_original): a_list = div_first.xpath(".//a") video_link = u"无" for a in a_list: - if "m.weibo.cn/s/video/show?object_id=" in a.xpath("@href")[0]: + if "m.weibo.cn/s/video/show?object_id=" in a.xpath( + "@href")[0]: video_link = a.xpath("@href")[0] break if video_link != u"无": video_link = video_link.replace( - "m.weibo.cn/s/video/show", "m.weibo.cn/s/video/object" - ) - wb_info = requests.get(video_link, cookies=self.cookie).json() - video_url = wb_info["data"]["object"]["stream"].get("hd_url") + "m.weibo.cn/s/video/show", "m.weibo.cn/s/video/object") + wb_info = requests.get(video_link, + cookies=self.cookie).json() + video_url = wb_info["data"]["object"]["stream"].get( + "hd_url") if not video_url: video_url = wb_info["data"]["object"]["stream"]["url"] if not video_url: # 说明该视频为直播 @@ -296,18 +295,18 @@ def get_one_weibo(self, info): is_original = self.is_original(info) if (not self.filter) or is_original: weibo["id"] = info.xpath("@id")[0][2:] - weibo["content"] = self.get_weibo_content(info, is_original) # 微博内容 + weibo["content"] = self.get_weibo_content(info, + is_original) # 微博内容 weibo["article_url"] = self.get_article_url(info) # 头条文章url picture_urls = self.get_picture_urls(info, is_original) weibo["original_pictures"] = picture_urls[ - "original_pictures" - ] # 原创图片url + "original_pictures"] # 原创图片url if not self.filter: weibo["retweet_pictures"] = picture_urls[ - "retweet_pictures" - ] # 转发图片url + "retweet_pictures"] # 转发图片url weibo["original"] = is_original # 是否原创微博 - weibo["video_url"] = self.get_video_url(info, is_original) # 微博视频url + weibo["video_url"] = self.get_video_url(info, + is_original) # 微博视频url weibo["publish_place"] = self.get_publish_place(info) # 微博发布位置 weibo["publish_time"] = self.get_publish_time(info) # 微博发布时间 weibo["publish_tool"] = self.get_publish_tool(info) # 微博发布工具 @@ -335,7 +334,8 @@ def extract_picture_urls(self, info, weibo_id): selector = 
handle_html(self.cookie, all_pic) preview_picture_list = selector.xpath("//img/@src") picture_list = [ - p.replace("/thumb180/", "/large/") for p in preview_picture_list + p.replace("/thumb180/", "/large/") + for p in preview_picture_list ] picture_urls = ",".join(picture_list) else: @@ -344,10 +344,10 @@ def extract_picture_urls(self, info, weibo_id): if len(link.xpath("@href")) > 0: if first_pic == link.xpath("@href")[0]: if len(link.xpath("img/@src")) > 0: - preview_picture = link.xpath("img/@src")[0] + preview_picture = link.xpath( + "img/@src")[0] picture_urls = preview_picture.replace( - "/wap180/", "/large/" - ) + "/wap180/", "/large/") break else: sys.exit( diff --git a/weibo_spider/parser/parser.py b/weibo_spider/weibo_parser/parser.py similarity index 100% rename from weibo_spider/parser/parser.py rename to weibo_spider/weibo_parser/parser.py diff --git a/weibo_spider/parser/util.py b/weibo_spider/weibo_parser/util.py similarity index 100% rename from weibo_spider/parser/util.py rename to weibo_spider/weibo_parser/util.py From 842ae753506645858df300119298049e30ec5f3f Mon Sep 17 00:00:00 2001 From: dataabc Date: Sun, 7 Jun 2020 01:49:51 +0800 Subject: [PATCH 171/363] git rollback --- .../{config.json => config_sample.json} | 2 +- weibo_spider/config_util.py | 15 +-- .../{weibo_parser => parser}/__init__.py | 0 .../comment_parser.py | 0 .../{weibo_parser => parser}/index_parser.py | 0 .../{weibo_parser => parser}/info_parser.py | 0 .../{weibo_parser => parser}/page_parser.py | 104 +++++++++--------- .../{weibo_parser => parser}/parser.py | 0 weibo_spider/{weibo_parser => parser}/util.py | 0 weibo_spider/weiboSpider.py | 97 ++++++++-------- 10 files changed, 107 insertions(+), 111 deletions(-) rename weibo_spider/{config.json => config_sample.json} (99%) rename weibo_spider/{weibo_parser => parser}/__init__.py (100%) rename weibo_spider/{weibo_parser => parser}/comment_parser.py (100%) rename weibo_spider/{weibo_parser => parser}/index_parser.py (100%) 
rename weibo_spider/{weibo_parser => parser}/info_parser.py (100%) rename weibo_spider/{weibo_parser => parser}/page_parser.py (84%) rename weibo_spider/{weibo_parser => parser}/parser.py (100%) rename weibo_spider/{weibo_parser => parser}/util.py (100%) diff --git a/weibo_spider/config.json b/weibo_spider/config_sample.json similarity index 99% rename from weibo_spider/config.json rename to weibo_spider/config_sample.json index 88b06773..04db0f39 100644 --- a/weibo_spider/config.json +++ b/weibo_spider/config_sample.json @@ -13,4 +13,4 @@ "password": "123456", "charset": "utf8mb4" } -} \ No newline at end of file +} diff --git a/weibo_spider/config_util.py b/weibo_spider/config_util.py index 12f6f2b8..7eb5b199 100644 --- a/weibo_spider/config_util.py +++ b/weibo_spider/config_util.py @@ -33,19 +33,17 @@ def validate_config(config): sys.exit(u"write_mode值应为list类型") for mode in config["write_mode"]: if mode not in write_mode: - sys.exit( - u"%s为无效模式,请从txt、csv、json、mongo和mysql中挑选一个或多个作为write_mode" % - mode) + sys.exit(u"%s为无效模式,请从txt、csv、json、mongo和mysql中挑选一个或多个作为write_mode" % mode) # 验证user_id_list user_id_list = config["user_id_list"] - if (not isinstance(user_id_list, - list)) and (not user_id_list.endswith(".txt")): + if (not isinstance(user_id_list, list)) and (not user_id_list.endswith(".txt")): sys.exit(u"user_id_list值应为list类型或txt文件路径") if not isinstance(user_id_list, list): if not os.path.isabs(user_id_list): - user_id_list = (os.path.split(os.path.realpath(__file__))[0] + - os.sep + user_id_list) + user_id_list = ( + os.path.split(os.path.realpath(__file__))[0] + os.sep + user_id_list + ) if not os.path.isfile(user_id_list): sys.exit(u"不存在%s文件" % user_id_list) @@ -76,8 +74,7 @@ def get_user_config_list(file_name, default_since_date): return user_config_list -def update_user_config_file(user_config_file_path, user_uri, nickname, - start_time): +def update_user_config_file(user_config_file_path, user_uri, nickname, start_time): """更新用户配置文件""" with 
open(user_config_file_path, "rb") as f: lines = f.read().splitlines() diff --git a/weibo_spider/weibo_parser/__init__.py b/weibo_spider/parser/__init__.py similarity index 100% rename from weibo_spider/weibo_parser/__init__.py rename to weibo_spider/parser/__init__.py diff --git a/weibo_spider/weibo_parser/comment_parser.py b/weibo_spider/parser/comment_parser.py similarity index 100% rename from weibo_spider/weibo_parser/comment_parser.py rename to weibo_spider/parser/comment_parser.py diff --git a/weibo_spider/weibo_parser/index_parser.py b/weibo_spider/parser/index_parser.py similarity index 100% rename from weibo_spider/weibo_parser/index_parser.py rename to weibo_spider/parser/index_parser.py diff --git a/weibo_spider/weibo_parser/info_parser.py b/weibo_spider/parser/info_parser.py similarity index 100% rename from weibo_spider/weibo_parser/info_parser.py rename to weibo_spider/parser/info_parser.py diff --git a/weibo_spider/weibo_parser/page_parser.py b/weibo_spider/parser/page_parser.py similarity index 84% rename from weibo_spider/weibo_parser/page_parser.py rename to weibo_spider/parser/page_parser.py index d9083b7b..1cd87e55 100644 --- a/weibo_spider/weibo_parser/page_parser.py +++ b/weibo_spider/parser/page_parser.py @@ -1,19 +1,15 @@ -import re -import sys +from datetime import datetime, timedelta import traceback from collections import OrderedDict -from datetime import datetime, timedelta +import re +import sys import requests -import datetime_util -import printer - -from .comment_parser import CommentParser from .parser import Parser -from .util import handle_garbled, handle_html - -sys.path.append('..') +from .comment_parser import CommentParser +from .util import handle_html, handle_garbled +from .. 
import printer, datetime_util class PageParser(Parser): @@ -31,15 +27,14 @@ def get_one_page(self, since_date, weibo_id_list): is_exist = info[0].xpath("div/span[@class='ctt']") weibos = [] if is_exist: - since_date = datetime_util.str_to_time(since_date) + since_date = datetime_util.str_to_time(since_date) for i in range(0, len(info) - 2): weibo = self.get_one_weibo(info[i]) if weibo: if weibo["id"] in weibo_id_list: continue - publish_time = datetime_util.str_to_time( - weibo["publish_time"]) - + publish_time = datetime_util.str_to_time(weibo["publish_time"]) + if publish_time < since_date: if self.is_pinned_weibo(info[i]): continue @@ -65,11 +60,10 @@ def get_original_weibo(self, info, weibo_id): """获取原创微博""" try: weibo_content = handle_garbled(info) - weibo_content = weibo_content[:weibo_content.rfind(u"赞")] + weibo_content = weibo_content[: weibo_content.rfind(u"赞")] a_text = info.xpath("div//a/text()") if u"全文" in a_text: - wb_content = CommentParser(self.cookie, - weibo_id).get_long_weibo() + wb_content = CommentParser(self.cookie, weibo_id).get_long_weibo() if wb_content: weibo_content = wb_content return weibo_content @@ -81,23 +75,29 @@ def get_retweet(self, info, weibo_id): """获取转发微博""" try: weibo_content = handle_garbled(info) - weibo_content = weibo_content[weibo_content.find(":") + - 1:weibo_content.rfind(u"赞")] - weibo_content = weibo_content[:weibo_content.rfind(u"赞")] + weibo_content = weibo_content[ + weibo_content.find(":") + 1 : weibo_content.rfind(u"赞") + ] + weibo_content = weibo_content[: weibo_content.rfind(u"赞")] a_text = info.xpath("div//a/text()") if u"全文" in a_text: - wb_content = CommentParser(self.cookie, - weibo_id).get_long_retweet() + wb_content = CommentParser(self.cookie, weibo_id).get_long_retweet() if wb_content: weibo_content = wb_content retweet_reason = handle_garbled(info.xpath("div")[-1]) - retweet_reason = retweet_reason[:retweet_reason.rindex(u"赞")] + retweet_reason = retweet_reason[: retweet_reason.rindex(u"赞")] 
original_user = info.xpath("div/span[@class='cmt']/a/text()") if original_user: original_user = original_user[0] - weibo_content = (retweet_reason + "\n" + u"原始用户: " + - original_user + "\n" + u"转发内容: " + - weibo_content) + weibo_content = ( + retweet_reason + + "\n" + + u"原始用户: " + + original_user + + "\n" + + u"转发内容: " + + weibo_content + ) else: weibo_content = retweet_reason + "\n" + u"转发内容: " + weibo_content return weibo_content @@ -135,13 +135,17 @@ def get_publish_place(self, info): a_list = div_first.xpath("a") publish_place = u"无" for a in a_list: - if ("place.weibo.com" in a.xpath("@href")[0] - and a.xpath("text()")[0] == u"显示地图"): + if ( + "place.weibo.com" in a.xpath("@href")[0] + and a.xpath("text()")[0] == u"显示地图" + ): weibo_a = div_first.xpath("span[@class='ctt']/a") if len(weibo_a) >= 1: publish_place = weibo_a[-1] - if (u"视频" == div_first.xpath( - "span[@class='ctt']/a/text()")[-1][-2:]): + if ( + u"视频" + == div_first.xpath("span[@class='ctt']/a/text()")[-1][-2:] + ): if len(weibo_a) >= 2: publish_place = weibo_a[-2] else: @@ -162,10 +166,9 @@ def get_publish_time(self, info): if u"刚刚" in publish_time: publish_time = datetime.now().strftime("%Y-%m-%d %H:%M") elif u"分钟" in publish_time: - minute = publish_time[:publish_time.find(u"分钟")] + minute = publish_time[: publish_time.find(u"分钟")] minute = timedelta(minutes=int(minute)) - publish_time = (datetime.now() - - minute).strftime("%Y-%m-%d %H:%M") + publish_time = (datetime.now() - minute).strftime("%Y-%m-%d %H:%M") elif u"今天" in publish_time: today = datetime.now().strftime("%Y-%m-%d") time = publish_time[3:] @@ -206,7 +209,7 @@ def get_weibo_footer(self, info): pattern = r"\d+" str_footer = info.xpath("div")[-1] str_footer = handle_garbled(str_footer) - str_footer = str_footer[str_footer.rfind(u"赞"):] + str_footer = str_footer[str_footer.rfind(u"赞") :] weibo_footer = re.findall(pattern, str_footer, re.M) up_num = int(weibo_footer[0]) @@ -257,17 +260,15 @@ def get_video_url(self, info, 
is_original): a_list = div_first.xpath(".//a") video_link = u"无" for a in a_list: - if "m.weibo.cn/s/video/show?object_id=" in a.xpath( - "@href")[0]: + if "m.weibo.cn/s/video/show?object_id=" in a.xpath("@href")[0]: video_link = a.xpath("@href")[0] break if video_link != u"无": video_link = video_link.replace( - "m.weibo.cn/s/video/show", "m.weibo.cn/s/video/object") - wb_info = requests.get(video_link, - cookies=self.cookie).json() - video_url = wb_info["data"]["object"]["stream"].get( - "hd_url") + "m.weibo.cn/s/video/show", "m.weibo.cn/s/video/object" + ) + wb_info = requests.get(video_link, cookies=self.cookie).json() + video_url = wb_info["data"]["object"]["stream"].get("hd_url") if not video_url: video_url = wb_info["data"]["object"]["stream"]["url"] if not video_url: # 说明该视频为直播 @@ -295,18 +296,18 @@ def get_one_weibo(self, info): is_original = self.is_original(info) if (not self.filter) or is_original: weibo["id"] = info.xpath("@id")[0][2:] - weibo["content"] = self.get_weibo_content(info, - is_original) # 微博内容 + weibo["content"] = self.get_weibo_content(info, is_original) # 微博内容 weibo["article_url"] = self.get_article_url(info) # 头条文章url picture_urls = self.get_picture_urls(info, is_original) weibo["original_pictures"] = picture_urls[ - "original_pictures"] # 原创图片url + "original_pictures" + ] # 原创图片url if not self.filter: weibo["retweet_pictures"] = picture_urls[ - "retweet_pictures"] # 转发图片url + "retweet_pictures" + ] # 转发图片url weibo["original"] = is_original # 是否原创微博 - weibo["video_url"] = self.get_video_url(info, - is_original) # 微博视频url + weibo["video_url"] = self.get_video_url(info, is_original) # 微博视频url weibo["publish_place"] = self.get_publish_place(info) # 微博发布位置 weibo["publish_time"] = self.get_publish_time(info) # 微博发布时间 weibo["publish_tool"] = self.get_publish_tool(info) # 微博发布工具 @@ -334,8 +335,7 @@ def extract_picture_urls(self, info, weibo_id): selector = handle_html(self.cookie, all_pic) preview_picture_list = selector.xpath("//img/@src") 
picture_list = [ - p.replace("/thumb180/", "/large/") - for p in preview_picture_list + p.replace("/thumb180/", "/large/") for p in preview_picture_list ] picture_urls = ",".join(picture_list) else: @@ -344,10 +344,10 @@ def extract_picture_urls(self, info, weibo_id): if len(link.xpath("@href")) > 0: if first_pic == link.xpath("@href")[0]: if len(link.xpath("img/@src")) > 0: - preview_picture = link.xpath( - "img/@src")[0] + preview_picture = link.xpath("img/@src")[0] picture_urls = preview_picture.replace( - "/wap180/", "/large/") + "/wap180/", "/large/" + ) break else: sys.exit( diff --git a/weibo_spider/weibo_parser/parser.py b/weibo_spider/parser/parser.py similarity index 100% rename from weibo_spider/weibo_parser/parser.py rename to weibo_spider/parser/parser.py diff --git a/weibo_spider/weibo_parser/util.py b/weibo_spider/parser/util.py similarity index 100% rename from weibo_spider/weibo_parser/util.py rename to weibo_spider/parser/util.py diff --git a/weibo_spider/weiboSpider.py b/weibo_spider/weiboSpider.py index ab677b66..eb0d4d22 100644 --- a/weibo_spider/weiboSpider.py +++ b/weibo_spider/weiboSpider.py @@ -10,13 +10,13 @@ from datetime import date, datetime, timedelta from time import sleep -from absl import app, flags from tqdm import tqdm +from absl import app, flags -import config_util -import datetime_util -import printer -from weibo_parser import IndexParser, PageParser +from . import datetime_util +from . import config_util +from . 
import printer +from .parser import IndexParser, PageParser FLAGS = flags.FLAGS @@ -28,18 +28,18 @@ class Spider: def __init__(self, config): """Weibo类初始化""" - self.filter = config[ - "filter"] # 取值范围为0、1,程序默认值为0,代表要爬取用户的全部微博,1代表只爬取用户的原创微博 + self.filter = config["filter"] # 取值范围为0、1,程序默认值为0,代表要爬取用户的全部微博,1代表只爬取用户的原创微博 since_date = str(config["since_date"]) if since_date.isdigit(): since_date = str(date.today() - timedelta(int(since_date))) self.since_date = since_date # 起始时间,即爬取发布日期从该值到现在的微博,形式为yyyy-mm-dd self.write_mode = config[ - "write_mode"] # 结果信息保存类型,为list形式,可包含txt、csv、json、mongo和mysql五种类型 - self.pic_download = config[ - "pic_download"] # 取值范围为0、1,程序默认值为0,代表不下载微博原始图片,1代表下载 + "write_mode" + ] # 结果信息保存类型,为list形式,可包含txt、csv、json、mongo和mysql五种类型 + self.pic_download = config["pic_download"] # 取值范围为0、1,程序默认值为0,代表不下载微博原始图片,1代表下载 self.video_download = config[ - "video_download"] # 取值范围为0、1,程序默认为0,代表不下载微博视频,1代表下载 + "video_download" + ] # 取值范围为0、1,程序默认为0,代表不下载微博视频,1代表下载 self.cookie = {"Cookie": config["cookie"]} self.mysql_config = config.get("mysql_config") # MySQL数据库连接配置,可以不填 user_id_list = config["user_id_list"] @@ -52,13 +52,14 @@ def __init__(self, config): sys.exit(u"当前路径:%s 不存在配置文件config.json" % user_id_list) self.user_config_file_path = user_id_list # 用户配置文件路径 user_config_list = config_util.get_user_config_list( - user_id_list, self.since_date) + user_id_list, self.since_date + ) else: self.user_config_file_path = "" - user_config_list = [{ - "user_uri": user_id, - "since_date": self.since_date - } for user_id in user_id_list] + user_config_list = [ + {"user_uri": user_id, "since_date": self.since_date} + for user_id in user_id_list + ] self.user_config_list = user_config_list # 要爬取的微博用户的user_config列表 self.user_config = {} # 用户配置,包含用户id和since_date self.start_time = "" # 获取用户第一条微博时的时间 @@ -89,30 +90,31 @@ def get_user_info(self, user_uri): def get_weibo_info(self): """获取微博信息""" try: - since_date = datetime_util.str_to_time( - self.user_config["since_date"]) + 
since_date = datetime_util.str_to_time(self.user_config["since_date"]) now = datetime.now().strftime("%Y-%m-%d %H:%M") now = datetime.strptime(now, "%Y-%m-%d %H:%M") if since_date <= now: page_num = IndexParser( - self.cookie, - self.user_config["user_uri"]).get_page_num() # 获取微博总页数 + self.cookie, self.user_config["user_uri"] + ).get_page_num() # 获取微博总页数 page1 = 0 random_pages = random.randint(1, 5) self.start_time = datetime.now().strftime("%Y-%m-%d %H:%M") for page in tqdm(range(1, page_num + 1), desc="Progress"): weibos, self.weibo_id_list = PageParser( - self.cookie, self.user_config["user_uri"], - page, self.filter).get_one_page( - self.since_date, - self.weibo_id_list) # 获取第page页的全部微博 - print(u"{}已获取{}({})的第{}页微博{}".format( - "-" * 30, - self.user["nickname"], - self.user["id"], - page, - "-" * 30, - )) + self.cookie, self.user_config["user_uri"], page, self.filter + ).get_one_page( + self.since_date, self.weibo_id_list + ) # 获取第page页的全部微博 + print( + u"{}已获取{}({})的第{}页微博{}".format( + "-" * 30, + self.user["nickname"], + self.user["id"], + page, + "-" * 30, + ) + ) if weibos: yield weibos else: @@ -135,8 +137,9 @@ def _get_filepath(self, type): if FLAGS.output_dir is not None: file_dir = FLAGS.output_dir else: - file_dir = (os.getcwd() + os.sep + "weibo" + os.sep + - self.user["nickname"]) + file_dir = ( + os.getcwd() + os.sep + "weibo" + os.sep + self.user["nickname"] + ) if type == "img" or type == "video": file_dir = file_dir + os.sep + type if not os.path.isdir(file_dir): @@ -158,38 +161,35 @@ def initialize_info(self, user_config): self.writers = [] if "csv" in self.write_mode: - from writer import CsvWriter + from .writer import CsvWriter - self.writers.append( - CsvWriter(self.filter, self._get_filepath("csv"))) + self.writers.append(CsvWriter(self.filter, self._get_filepath("csv"))) if "txt" in self.write_mode: - from writer import TxtWriter + from .writer import TxtWriter - self.writers.append( - TxtWriter(self.filter, self._get_filepath("txt"))) + 
self.writers.append(TxtWriter(self.filter, self._get_filepath("txt"))) if "json" in self.write_mode: - from writer import JsonWriter + from .writer import JsonWriter self.writers.append(JsonWriter(self._get_filepath("json"))) if "mysql" in self.write_mode: - from writer import MySqlWriter + from .writer import MySqlWriter self.writers.append(MySqlWriter(self.mysql_config)) if "mongo" in self.write_mode: - from writer import MongoWriter + from .writer import MongoWriter self.writers.append(MongoWriter()) self.downloaders = [] if self.pic_download == 1: - from downloader import ImgDownloader + from .downloader import ImgDownloader self.downloaders.append(ImgDownloader(self._get_filepath("img"))) if self.video_download == 1: - from downloader import VideoDownloader + from .downloader import VideoDownloader - self.downloaders.append( - VideoDownloader(self._get_filepath("video"))) + self.downloaders.append(VideoDownloader(self._get_filepath("video"))) def start(self): """运行爬虫""" @@ -226,8 +226,7 @@ def start(self): def _get_config(): """获取config.json数据""" - src = os.path.split( - os.path.realpath(__file__))[0] + os.sep + 'config_sample.json' + src = os.path.split(os.path.realpath(__file__))[0] + os.sep + 'config_sample.json' config_path = os.getcwd() + os.sep + 'config.json' if FLAGS.config_path: config_path = FLAGS.config_path @@ -256,4 +255,4 @@ def main(_): if __name__ == "__main__": - app.run(main) + app.run(main) \ No newline at end of file From 33376d149a55267629a9483a71b9702c5a230bf7 Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Sun, 7 Jun 2020 02:24:07 +0800 Subject: [PATCH 172/363] Update README.md --- README.md | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 6fe59d14..0a99b7c4 100644 --- a/README.md +++ b/README.md @@ -93,9 +93,9 @@ 对于上述参数的含义以及取值范围,这里仅作简单介绍,详细信息见[程序设置](#2程序设置)。 
>**user_id_list**代表我们要爬取的微博用户的user_id,可以是一个或多个,也可以是文件路径,微博用户Dear-迪丽热巴的user_id为1669879400,具体如何获取user_id见[如何获取user_id](#如何获取user_id);
**filter**的值为1代表爬取全部原创微博,值为0代表爬取全部微博(原创+转发);
**since_date**代表我们要爬取since_date日期之后发布的微博,因为我要爬迪丽热巴的全部原创微博,所以since_date设置了一个非常早的值;
**write_mode**代表结果文件的保存类型,我想要把结果写入txt文件、csv文件和json文件,所以它的值为["csv", "txt", "json"],如果你想写入数据库,具体设置见[设置数据库](#3设置数据库可选);
**pic_download**值为1代表下载微博中的图片,值为0代表不下载;
**video_download**值为1代表下载微博中的视频,值为0代表不下载;
**cookie**是爬虫微博的cookie,具体如何获取cookie见[如何获取cookie](#如何获取cookie),获取cookie后把"your cookie"替换成真实的cookie值即可。
-cookie修改完成后运行weiboSpider.py,该文件位于weibospider=>weibo_spider: +cookie修改完成后在weiboSpider目录下运行如下命令: ```bash -$ python3 weiboSpider.py +$ python3 -m weibo_spider ``` 程序会自动生成一个weibo文件夹,我们以后爬取的所有微博都被存储在这里。然后程序在该文件夹下生成一个名为"Dear-迪丽热巴"的文件夹,迪丽热巴的所有微博爬取结果都在这里。"Dear-迪丽热巴"文件夹里包含一个csv文件、一个txt文件、一个json文件、一个img文件夹和一个video文件夹,img文件夹用来存储下载到的图片,video文件夹用来存储下载到的视频。如果你设置了保存数据库功能,这些信息也会保存在数据库里,数据库设置见[设置数据库](#3设置数据库可选)部分。

@@ -186,7 +186,11 @@ $ pip install -r requirements.txt $ python3 -m pip install weibo-spider ``` ### 2.程序设置 -如果你使用的是**源码下载安装**,请打开**config.json**文件,你会看到如下内容: +**源码下载安装**的用户在weiboSpider目录下运行如下命令,**pip安装**的用户在任意有写权限的目录运行如下命令: +```bash +$ python3 -m weibo_spider +``` +第一次运行会生成**config.json**文件,请打开**config.json**文件,你会看到如下内容: ``` { "user_id_list": ["1669879400"], @@ -205,7 +209,6 @@ $ python3 -m pip install weibo-spider } } ``` -如果你使用的是**pip安装**,第一次执行[运行脚本](#4运行脚本)中的命令,程序会自动创建上面的config.json文件。
下面讲解每个参数的含义与设置方法。
**设置user_id_list**
user_id_list是我们要爬取的微博的id,可以是一个,也可以是多个,例如: @@ -312,11 +315,7 @@ MySQL和MongDB数据库的写入内容一样。程序首先会创建一个名为 ### 4.运行脚本 -**源码下载安装**的用户可以在weiboSpider.py文件所在目录下运行 -```bash -$ python3 weiboSpider.py -``` -**pip安装**的用户可以在任意有写权限的目录运行 +**源码下载安装**的用户可以在weiboSpider目录运行如下命令,**pip安装**的用户可以在任意有写权限的目录运行如下命令 ```bash $ python3 -m weibo_spider ``` From 03103a74b129bead4bef52b0acc138f1072f55a0 Mon Sep 17 00:00:00 2001 From: dataabc Date: Sun, 7 Jun 2020 18:26:49 +0800 Subject: [PATCH 173/363] =?UTF-8?q?fix:=20=E4=BF=AE=E5=A4=8Dpypi=E7=89=88?= =?UTF-8?q?=E6=97=A0=E6=B3=95=E6=B7=BB=E5=8A=A0=E9=85=8D=E7=BD=AE=E6=96=87?= =?UTF-8?q?=E4=BB=B6=E7=9A=84=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- setup.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 07809240..54acda50 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setuptools.setup( name='weibo-spider', - version='0.0.5', + version='0.0.7', author='Chen Lei', author_email='chillychen1991@gmail.com', description='新浪微博爬虫,用python爬取新浪微博数据。', @@ -13,6 +13,7 @@ long_description_content_type='text/markdown', url='https://github.com/dataabc/weiboSpider', packages=setuptools.find_packages(), + package_data={'weibo_spider': ['config_sample.json']}, classifiers=[ 'Programming Language :: Python :: 3', 'Operating System :: OS Independent', From fdf32bf477c1dadb10ef7e3b56aa96255ea8bcee Mon Sep 17 00:00:00 2001 From: dataabc Date: Sun, 7 Jun 2020 19:12:17 +0800 Subject: [PATCH 174/363] =?UTF-8?q?fix:=20=E4=BF=AE=E5=A4=8D=E8=BF=90?= =?UTF-8?q?=E8=A1=8C=E7=A8=8B=E5=BA=8F=E5=BF=85=E9=A1=BB=E5=AE=89=E8=A3=85?= =?UTF-8?q?pymysql=E7=9A=84=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- weibo_spider/__main__.py | 4 +- weibo_spider/config_util.py | 15 ++-- weibo_spider/parser/comment_parser.py | 9 +-- weibo_spider/parser/index_parser.py | 15 ++-- weibo_spider/parser/info_parser.py | 
36 ++++----- weibo_spider/parser/page_parser.py | 103 +++++++++++++------------- weibo_spider/parser/util.py | 10 +-- weibo_spider/printer.py | 1 - weibo_spider/weiboSpider.py | 79 ++++++++++---------- weibo_spider/writer/__init__.py | 2 +- weibo_spider/writer/csv_writer.py | 12 ++- weibo_spider/writer/mysql_writer.py | 28 +++---- weibo_spider/writer/txt_writer.py | 45 +++-------- 13 files changed, 171 insertions(+), 188 deletions(-) diff --git a/weibo_spider/__main__.py b/weibo_spider/__main__.py index f562cfae..fb8340a6 100644 --- a/weibo_spider/__main__.py +++ b/weibo_spider/__main__.py @@ -1,5 +1,5 @@ -from .weiboSpider import main - from absl import app +from .weiboSpider import main + app.run(main) diff --git a/weibo_spider/config_util.py b/weibo_spider/config_util.py index 7eb5b199..12f6f2b8 100644 --- a/weibo_spider/config_util.py +++ b/weibo_spider/config_util.py @@ -33,17 +33,19 @@ def validate_config(config): sys.exit(u"write_mode值应为list类型") for mode in config["write_mode"]: if mode not in write_mode: - sys.exit(u"%s为无效模式,请从txt、csv、json、mongo和mysql中挑选一个或多个作为write_mode" % mode) + sys.exit( + u"%s为无效模式,请从txt、csv、json、mongo和mysql中挑选一个或多个作为write_mode" % + mode) # 验证user_id_list user_id_list = config["user_id_list"] - if (not isinstance(user_id_list, list)) and (not user_id_list.endswith(".txt")): + if (not isinstance(user_id_list, + list)) and (not user_id_list.endswith(".txt")): sys.exit(u"user_id_list值应为list类型或txt文件路径") if not isinstance(user_id_list, list): if not os.path.isabs(user_id_list): - user_id_list = ( - os.path.split(os.path.realpath(__file__))[0] + os.sep + user_id_list - ) + user_id_list = (os.path.split(os.path.realpath(__file__))[0] + + os.sep + user_id_list) if not os.path.isfile(user_id_list): sys.exit(u"不存在%s文件" % user_id_list) @@ -74,7 +76,8 @@ def get_user_config_list(file_name, default_since_date): return user_config_list -def update_user_config_file(user_config_file_path, user_uri, nickname, start_time): +def 
update_user_config_file(user_config_file_path, user_uri, nickname, + start_time): """更新用户配置文件""" with open(user_config_file_path, "rb") as f: lines = f.read().splitlines() diff --git a/weibo_spider/parser/comment_parser.py b/weibo_spider/parser/comment_parser.py index 157f0bc0..53c2ab59 100644 --- a/weibo_spider/parser/comment_parser.py +++ b/weibo_spider/parser/comment_parser.py @@ -1,6 +1,6 @@ -from time import sleep import random import traceback +from time import sleep from .parser import Parser from .util import handle_html @@ -21,9 +21,8 @@ def get_long_weibo(self): info = self.selector.xpath("//div[@class='c']")[1] wb_content = self.handle_garbled(info) wb_time = info.xpath("//span[@class='ct']/text()")[0] - weibo_content = wb_content[ - wb_content.find(":") + 1 : wb_content.rfind(wb_time) - ] + weibo_content = wb_content[wb_content.find(":") + + 1:wb_content.rfind(wb_time)] if weibo_content is not None: return weibo_content sleep(random.randint(6, 10)) @@ -36,7 +35,7 @@ def get_long_retweet(self): """获取长转发微博""" try: wb_content = self.get_long_weibo() - weibo_content = wb_content[: wb_content.rfind(u"原文转发")] + weibo_content = wb_content[:wb_content.rfind(u"原文转发")] return weibo_content except Exception as e: print("Error: ", e) diff --git a/weibo_spider/parser/index_parser.py b/weibo_spider/parser/index_parser.py index 6bcfee66..88a10e9d 100644 --- a/weibo_spider/parser/index_parser.py +++ b/weibo_spider/parser/index_parser.py @@ -1,8 +1,8 @@ import traceback -from .util import handle_html -from .parser import Parser from .info_parser import InfoParser +from .parser import Parser +from .util import handle_html class IndexParser(Parser): @@ -18,7 +18,8 @@ def _get_user_id(self): url_list = self.selector.xpath("//div[@class='u']//a") for url in url_list: if (url.xpath("string(.)")) == u"资料": - if url.xpath("@href") and url.xpath("@href")[0].endswith("/info"): + if url.xpath("@href") and url.xpath("@href")[0].endswith( + "/info"): link = url.xpath("@href")[0] 
user_id = link[1:-5] break @@ -29,7 +30,8 @@ def get_user(self): try: self.user = {} self.user["id"] = self._get_user_id() - user = InfoParser(self.cookie, self.user["id"]).extract_user_info() # 获取用户信息 + user = InfoParser(self.cookie, + self.user["id"]).extract_user_info() # 获取用户信息 for k, v in user.items(): self.user[k] = v user_info = self.selector.xpath("//div[@class='tip2']/*/text()") @@ -50,9 +52,8 @@ def get_page_num(self): if self.selector.xpath("//input[@name='mp']") == []: page_num = 1 else: - page_num = (int)( - self.selector.xpath("//input[@name='mp']")[0].attrib["value"] - ) + page_num = (int)(self.selector.xpath("//input[@name='mp']") + [0].attrib["value"]) return page_num except Exception as e: print("Error: ", e) diff --git a/weibo_spider/parser/info_parser.py b/weibo_spider/parser/info_parser.py index 0b7f990f..cd52e941 100644 --- a/weibo_spider/parser/info_parser.py +++ b/weibo_spider/parser/info_parser.py @@ -1,8 +1,8 @@ -import traceback import sys +import traceback -from .util import handle_html from .parser import Parser +from .util import handle_html class InfoParser(Parser): @@ -36,21 +36,23 @@ def extract_user_info(self): user[i] = "" for i in basic_info: if i.split(":", 1)[0] in zh_list: - user[en_list[zh_list.index(i.split(":", 1)[0])]] = i.split(":", 1)[ - 1 - ].replace("\u3000", "") - if self.selector.xpath("//div[@class='tip'][2]/text()")[0] == u"学习经历": - user["education"] = self.selector.xpath("//div[@class='c'][4]/text()")[ - 0 - ][1:].replace(u"\xa0", u" ") - if self.selector.xpath("//div[@class='tip'][3]/text()")[0] == u"工作经历": - user["work"] = self.selector.xpath("//div[@class='c'][5]/text()")[ - 0 - ][1:].replace(u"\xa0", u" ") - elif self.selector.xpath("//div[@class='tip'][2]/text()")[0] == u"工作经历": - user["work"] = self.selector.xpath("//div[@class='c'][4]/text()")[0][ - 1: - ].replace(u"\xa0", u" ") + user[en_list[zh_list.index(i.split(":", 1)[0])]] = i.split( + ":", 1)[1].replace("\u3000", "") + if self.selector.xpath( + 
"//div[@class='tip'][2]/text()")[0] == u"学习经历": + user["education"] = self.selector.xpath( + "//div[@class='c'][4]/text()")[0][1:].replace( + u"\xa0", u" ") + if self.selector.xpath( + "//div[@class='tip'][3]/text()")[0] == u"工作经历": + user["work"] = self.selector.xpath( + "//div[@class='c'][5]/text()")[0][1:].replace( + u"\xa0", u" ") + elif self.selector.xpath( + "//div[@class='tip'][2]/text()")[0] == u"工作经历": + user["work"] = self.selector.xpath( + "//div[@class='c'][4]/text()")[0][1:].replace( + u"\xa0", u" ") return user except Exception as e: print("Error: ", e) diff --git a/weibo_spider/parser/page_parser.py b/weibo_spider/parser/page_parser.py index 1cd87e55..81b7a72f 100644 --- a/weibo_spider/parser/page_parser.py +++ b/weibo_spider/parser/page_parser.py @@ -1,15 +1,15 @@ -from datetime import datetime, timedelta -import traceback -from collections import OrderedDict import re import sys +import traceback +from collections import OrderedDict +from datetime import datetime, timedelta import requests -from .parser import Parser +from .. import datetime_util, printer from .comment_parser import CommentParser -from .util import handle_html, handle_garbled -from .. 
import printer, datetime_util +from .parser import Parser +from .util import handle_garbled, handle_html class PageParser(Parser): @@ -27,14 +27,15 @@ def get_one_page(self, since_date, weibo_id_list): is_exist = info[0].xpath("div/span[@class='ctt']") weibos = [] if is_exist: - since_date = datetime_util.str_to_time(since_date) + since_date = datetime_util.str_to_time(since_date) for i in range(0, len(info) - 2): weibo = self.get_one_weibo(info[i]) if weibo: if weibo["id"] in weibo_id_list: continue - publish_time = datetime_util.str_to_time(weibo["publish_time"]) - + publish_time = datetime_util.str_to_time( + weibo["publish_time"]) + if publish_time < since_date: if self.is_pinned_weibo(info[i]): continue @@ -60,10 +61,11 @@ def get_original_weibo(self, info, weibo_id): """获取原创微博""" try: weibo_content = handle_garbled(info) - weibo_content = weibo_content[: weibo_content.rfind(u"赞")] + weibo_content = weibo_content[:weibo_content.rfind(u"赞")] a_text = info.xpath("div//a/text()") if u"全文" in a_text: - wb_content = CommentParser(self.cookie, weibo_id).get_long_weibo() + wb_content = CommentParser(self.cookie, + weibo_id).get_long_weibo() if wb_content: weibo_content = wb_content return weibo_content @@ -75,31 +77,26 @@ def get_retweet(self, info, weibo_id): """获取转发微博""" try: weibo_content = handle_garbled(info) - weibo_content = weibo_content[ - weibo_content.find(":") + 1 : weibo_content.rfind(u"赞") - ] - weibo_content = weibo_content[: weibo_content.rfind(u"赞")] + weibo_content = weibo_content[weibo_content.find(":") + + 1:weibo_content.rfind(u"赞")] + weibo_content = weibo_content[:weibo_content.rfind(u"赞")] a_text = info.xpath("div//a/text()") if u"全文" in a_text: - wb_content = CommentParser(self.cookie, weibo_id).get_long_retweet() + wb_content = CommentParser(self.cookie, + weibo_id).get_long_retweet() if wb_content: weibo_content = wb_content retweet_reason = handle_garbled(info.xpath("div")[-1]) - retweet_reason = retweet_reason[: 
retweet_reason.rindex(u"赞")] + retweet_reason = retweet_reason[:retweet_reason.rindex(u"赞")] original_user = info.xpath("div/span[@class='cmt']/a/text()") if original_user: original_user = original_user[0] - weibo_content = ( - retweet_reason - + "\n" - + u"原始用户: " - + original_user - + "\n" - + u"转发内容: " - + weibo_content - ) + weibo_content = (retweet_reason + "\n" + u"原始用户: " + + original_user + "\n" + u"转发内容: " + + weibo_content) else: - weibo_content = retweet_reason + "\n" + u"转发内容: " + weibo_content + weibo_content = (retweet_reason + "\n" + u"转发内容: " + + weibo_content) return weibo_content except Exception as e: print("Error: ", e) @@ -135,17 +132,13 @@ def get_publish_place(self, info): a_list = div_first.xpath("a") publish_place = u"无" for a in a_list: - if ( - "place.weibo.com" in a.xpath("@href")[0] - and a.xpath("text()")[0] == u"显示地图" - ): + if ("place.weibo.com" in a.xpath("@href")[0] + and a.xpath("text()")[0] == u"显示地图"): weibo_a = div_first.xpath("span[@class='ctt']/a") if len(weibo_a) >= 1: publish_place = weibo_a[-1] - if ( - u"视频" - == div_first.xpath("span[@class='ctt']/a/text()")[-1][-2:] - ): + if (u"视频" == div_first.xpath( + "span[@class='ctt']/a/text()")[-1][-2:]): if len(weibo_a) >= 2: publish_place = weibo_a[-2] else: @@ -166,9 +159,10 @@ def get_publish_time(self, info): if u"刚刚" in publish_time: publish_time = datetime.now().strftime("%Y-%m-%d %H:%M") elif u"分钟" in publish_time: - minute = publish_time[: publish_time.find(u"分钟")] + minute = publish_time[:publish_time.find(u"分钟")] minute = timedelta(minutes=int(minute)) - publish_time = (datetime.now() - minute).strftime("%Y-%m-%d %H:%M") + publish_time = (datetime.now() - + minute).strftime("%Y-%m-%d %H:%M") elif u"今天" in publish_time: today = datetime.now().strftime("%Y-%m-%d") time = publish_time[3:] @@ -209,7 +203,7 @@ def get_weibo_footer(self, info): pattern = r"\d+" str_footer = info.xpath("div")[-1] str_footer = handle_garbled(str_footer) - str_footer = 
str_footer[str_footer.rfind(u"赞") :] + str_footer = str_footer[str_footer.rfind(u"赞"):] weibo_footer = re.findall(pattern, str_footer, re.M) up_num = int(weibo_footer[0]) @@ -260,15 +254,17 @@ def get_video_url(self, info, is_original): a_list = div_first.xpath(".//a") video_link = u"无" for a in a_list: - if "m.weibo.cn/s/video/show?object_id=" in a.xpath("@href")[0]: + if "m.weibo.cn/s/video/show?object_id=" in a.xpath( + "@href")[0]: video_link = a.xpath("@href")[0] break if video_link != u"无": video_link = video_link.replace( - "m.weibo.cn/s/video/show", "m.weibo.cn/s/video/object" - ) - wb_info = requests.get(video_link, cookies=self.cookie).json() - video_url = wb_info["data"]["object"]["stream"].get("hd_url") + "m.weibo.cn/s/video/show", "m.weibo.cn/s/video/object") + wb_info = requests.get(video_link, + cookies=self.cookie).json() + video_url = wb_info["data"]["object"]["stream"].get( + "hd_url") if not video_url: video_url = wb_info["data"]["object"]["stream"]["url"] if not video_url: # 说明该视频为直播 @@ -296,18 +292,18 @@ def get_one_weibo(self, info): is_original = self.is_original(info) if (not self.filter) or is_original: weibo["id"] = info.xpath("@id")[0][2:] - weibo["content"] = self.get_weibo_content(info, is_original) # 微博内容 + weibo["content"] = self.get_weibo_content(info, + is_original) # 微博内容 weibo["article_url"] = self.get_article_url(info) # 头条文章url picture_urls = self.get_picture_urls(info, is_original) weibo["original_pictures"] = picture_urls[ - "original_pictures" - ] # 原创图片url + "original_pictures"] # 原创图片url if not self.filter: weibo["retweet_pictures"] = picture_urls[ - "retweet_pictures" - ] # 转发图片url + "retweet_pictures"] # 转发图片url weibo["original"] = is_original # 是否原创微博 - weibo["video_url"] = self.get_video_url(info, is_original) # 微博视频url + weibo["video_url"] = self.get_video_url(info, + is_original) # 微博视频url weibo["publish_place"] = self.get_publish_place(info) # 微博发布位置 weibo["publish_time"] = self.get_publish_time(info) # 微博发布时间 
weibo["publish_tool"] = self.get_publish_tool(info) # 微博发布工具 @@ -335,7 +331,8 @@ def extract_picture_urls(self, info, weibo_id): selector = handle_html(self.cookie, all_pic) preview_picture_list = selector.xpath("//img/@src") picture_list = [ - p.replace("/thumb180/", "/large/") for p in preview_picture_list + p.replace("/thumb180/", "/large/") + for p in preview_picture_list ] picture_urls = ",".join(picture_list) else: @@ -344,10 +341,10 @@ def extract_picture_urls(self, info, weibo_id): if len(link.xpath("@href")) > 0: if first_pic == link.xpath("@href")[0]: if len(link.xpath("img/@src")) > 0: - preview_picture = link.xpath("img/@src")[0] + preview_picture = link.xpath( + "img/@src")[0] picture_urls = preview_picture.replace( - "/wap180/", "/large/" - ) + "/wap180/", "/large/") break else: sys.exit( diff --git a/weibo_spider/parser/util.py b/weibo_spider/parser/util.py index aff70ad4..ec5b1f7d 100644 --- a/weibo_spider/parser/util.py +++ b/weibo_spider/parser/util.py @@ -1,8 +1,8 @@ import sys import traceback -from lxml import etree import requests +from lxml import etree def handle_html(cookie, url): @@ -19,12 +19,8 @@ def handle_html(cookie, url): def handle_garbled(info): """处理乱码""" try: - info = ( - info.xpath("string(.)") - .replace(u"\u200b", "") - .encode(sys.stdout.encoding, "ignore") - .decode(sys.stdout.encoding) - ) + info = (info.xpath("string(.)").replace(u"\u200b", "").encode( + sys.stdout.encoding, "ignore").decode(sys.stdout.encoding)) return info except Exception as e: print("Error: ", e) diff --git a/weibo_spider/printer.py b/weibo_spider/printer.py index 7f1ae262..b1fa15d6 100644 --- a/weibo_spider/printer.py +++ b/weibo_spider/printer.py @@ -21,4 +21,3 @@ def print_user_info(user): print(u"微博数: %d" % user["weibo_num"]) print(u"关注数: %d" % user["following"]) print(u"粉丝数: %d" % user["followers"]) - diff --git a/weibo_spider/weiboSpider.py b/weibo_spider/weiboSpider.py index eb0d4d22..9a45d0ee 100644 --- a/weibo_spider/weiboSpider.py +++ 
b/weibo_spider/weiboSpider.py @@ -10,12 +10,10 @@ from datetime import date, datetime, timedelta from time import sleep -from tqdm import tqdm from absl import app, flags +from tqdm import tqdm -from . import datetime_util -from . import config_util -from . import printer +from . import config_util, datetime_util, printer from .parser import IndexParser, PageParser FLAGS = flags.FLAGS @@ -28,18 +26,18 @@ class Spider: def __init__(self, config): """Weibo类初始化""" - self.filter = config["filter"] # 取值范围为0、1,程序默认值为0,代表要爬取用户的全部微博,1代表只爬取用户的原创微博 + self.filter = config[ + "filter"] # 取值范围为0、1,程序默认值为0,代表要爬取用户的全部微博,1代表只爬取用户的原创微博 since_date = str(config["since_date"]) if since_date.isdigit(): since_date = str(date.today() - timedelta(int(since_date))) self.since_date = since_date # 起始时间,即爬取发布日期从该值到现在的微博,形式为yyyy-mm-dd self.write_mode = config[ - "write_mode" - ] # 结果信息保存类型,为list形式,可包含txt、csv、json、mongo和mysql五种类型 - self.pic_download = config["pic_download"] # 取值范围为0、1,程序默认值为0,代表不下载微博原始图片,1代表下载 + "write_mode"] # 结果信息保存类型,为list形式,可包含txt、csv、json、mongo和mysql五种类型 + self.pic_download = config[ + "pic_download"] # 取值范围为0、1,程序默认值为0,代表不下载微博原始图片,1代表下载 self.video_download = config[ - "video_download" - ] # 取值范围为0、1,程序默认为0,代表不下载微博视频,1代表下载 + "video_download"] # 取值范围为0、1,程序默认为0,代表不下载微博视频,1代表下载 self.cookie = {"Cookie": config["cookie"]} self.mysql_config = config.get("mysql_config") # MySQL数据库连接配置,可以不填 user_id_list = config["user_id_list"] @@ -52,14 +50,13 @@ def __init__(self, config): sys.exit(u"当前路径:%s 不存在配置文件config.json" % user_id_list) self.user_config_file_path = user_id_list # 用户配置文件路径 user_config_list = config_util.get_user_config_list( - user_id_list, self.since_date - ) + user_id_list, self.since_date) else: self.user_config_file_path = "" - user_config_list = [ - {"user_uri": user_id, "since_date": self.since_date} - for user_id in user_id_list - ] + user_config_list = [{ + "user_uri": user_id, + "since_date": self.since_date + } for user_id in user_id_list] self.user_config_list 
= user_config_list # 要爬取的微博用户的user_config列表 self.user_config = {} # 用户配置,包含用户id和since_date self.start_time = "" # 获取用户第一条微博时的时间 @@ -90,31 +87,30 @@ def get_user_info(self, user_uri): def get_weibo_info(self): """获取微博信息""" try: - since_date = datetime_util.str_to_time(self.user_config["since_date"]) + since_date = datetime_util.str_to_time( + self.user_config["since_date"]) now = datetime.now().strftime("%Y-%m-%d %H:%M") now = datetime.strptime(now, "%Y-%m-%d %H:%M") if since_date <= now: page_num = IndexParser( - self.cookie, self.user_config["user_uri"] - ).get_page_num() # 获取微博总页数 + self.cookie, + self.user_config["user_uri"]).get_page_num() # 获取微博总页数 page1 = 0 random_pages = random.randint(1, 5) self.start_time = datetime.now().strftime("%Y-%m-%d %H:%M") for page in tqdm(range(1, page_num + 1), desc="Progress"): weibos, self.weibo_id_list = PageParser( - self.cookie, self.user_config["user_uri"], page, self.filter - ).get_one_page( - self.since_date, self.weibo_id_list - ) # 获取第page页的全部微博 - print( - u"{}已获取{}({})的第{}页微博{}".format( - "-" * 30, - self.user["nickname"], - self.user["id"], - page, - "-" * 30, - ) - ) + self.cookie, self.user_config["user_uri"], + page, self.filter).get_one_page( + self.since_date, + self.weibo_id_list) # 获取第page页的全部微博 + print(u"{}已获取{}({})的第{}页微博{}".format( + "-" * 30, + self.user["nickname"], + self.user["id"], + page, + "-" * 30, + )) if weibos: yield weibos else: @@ -137,9 +133,8 @@ def _get_filepath(self, type): if FLAGS.output_dir is not None: file_dir = FLAGS.output_dir else: - file_dir = ( - os.getcwd() + os.sep + "weibo" + os.sep + self.user["nickname"] - ) + file_dir = (os.getcwd() + os.sep + "weibo" + os.sep + + self.user["nickname"]) if type == "img" or type == "video": file_dir = file_dir + os.sep + type if not os.path.isdir(file_dir): @@ -163,11 +158,13 @@ def initialize_info(self, user_config): if "csv" in self.write_mode: from .writer import CsvWriter - self.writers.append(CsvWriter(self.filter, 
self._get_filepath("csv"))) + self.writers.append( + CsvWriter(self.filter, self._get_filepath("csv"))) if "txt" in self.write_mode: from .writer import TxtWriter - self.writers.append(TxtWriter(self.filter, self._get_filepath("txt"))) + self.writers.append( + TxtWriter(self.filter, self._get_filepath("txt"))) if "json" in self.write_mode: from .writer import JsonWriter @@ -189,7 +186,8 @@ def initialize_info(self, user_config): if self.video_download == 1: from .downloader import VideoDownloader - self.downloaders.append(VideoDownloader(self._get_filepath("video"))) + self.downloaders.append( + VideoDownloader(self._get_filepath("video"))) def start(self): """运行爬虫""" @@ -226,7 +224,8 @@ def start(self): def _get_config(): """获取config.json数据""" - src = os.path.split(os.path.realpath(__file__))[0] + os.sep + 'config_sample.json' + src = os.path.split( + os.path.realpath(__file__))[0] + os.sep + 'config_sample.json' config_path = os.getcwd() + os.sep + 'config.json' if FLAGS.config_path: config_path = FLAGS.config_path @@ -255,4 +254,4 @@ def main(_): if __name__ == "__main__": - app.run(main) \ No newline at end of file + app.run(main) diff --git a/weibo_spider/writer/__init__.py b/weibo_spider/writer/__init__.py index df27490f..95fc931b 100644 --- a/weibo_spider/writer/__init__.py +++ b/weibo_spider/writer/__init__.py @@ -1,7 +1,7 @@ from .csv_writer import CsvWriter -from .txt_writer import TxtWriter from .json_writer import JsonWriter from .mongo_writer import MongoWriter from .mysql_writer import MySqlWriter +from .txt_writer import TxtWriter __all__ = [CsvWriter, TxtWriter, JsonWriter, MongoWriter, MySqlWriter] diff --git a/weibo_spider/writer/csv_writer.py b/weibo_spider/writer/csv_writer.py index 9cd93a3a..1e60039d 100644 --- a/weibo_spider/writer/csv_writer.py +++ b/weibo_spider/writer/csv_writer.py @@ -1,6 +1,6 @@ -import sys import codecs import csv +import sys import traceback from .writer import Writer @@ -39,7 +39,10 @@ def write_user(self, user): 
writer = csv.writer(f) writer.writerows([result_headers]) else: # python3.x - with open(self.file_path, "a", encoding="utf-8-sig", newline="") as f: + with open(self.file_path, + "a", + encoding="utf-8-sig", + newline="") as f: writer = csv.writer(f) writer.writerows([result_headers]) except Exception as e: @@ -58,7 +61,10 @@ def write_weibo(self, weibos): writer = csv.writer(f) writer.writerows(result_data) else: # python3.x - with open(self.file_path, "a", encoding="utf-8-sig", newline="") as f: + with open(self.file_path, + "a", + encoding="utf-8-sig", + newline="") as f: writer = csv.writer(f) writer.writerows(result_data) print(u"%d条微博写入csv文件完毕,保存路径:" % len(weibos)) diff --git a/weibo_spider/writer/mysql_writer.py b/weibo_spider/writer/mysql_writer.py index 2d293e33..d4fb84bc 100644 --- a/weibo_spider/writer/mysql_writer.py +++ b/weibo_spider/writer/mysql_writer.py @@ -2,12 +2,6 @@ import sys import traceback -try: - import pymysql -except ImportError: - sys.exit(u"系统中可能没有安装pymysql库,请先运行 pip install pymysql ,再运行程序") - - from .writer import Writer @@ -25,6 +19,10 @@ def _mysql_create(self, connection, sql): def _mysql_create_database(self, sql): """创建MySQL数据库""" + try: + import pymysql + except ImportError: + sys.exit(u"系统中可能没有安装pymysql库,请先运行 pip install pymysql ,再运行程序") try: print(self.mysql_config, sql) connection = pymysql.connect(**self.mysql_config) @@ -34,12 +32,14 @@ def _mysql_create_database(self, sql): def _mysql_create_table(self, sql): """创建MySQL表""" + import pymysql self.mysql_config["db"] = "weibo" connection = pymysql.connect(**self.mysql_config) self._mysql_create(connection, sql) def _mysql_insert(self, table, data_list): """向MySQL表插入或更新数据""" + import pymysql if len(data_list) > 0: keys = ", ".join(data_list[0].keys()) values = ", ".join(["%s"] * len(data_list[0])) @@ -47,15 +47,17 @@ def _mysql_insert(self, table, data_list): connection = pymysql.connect(**self.mysql_config) cursor = connection.cursor() sql = """INSERT INTO {table}({keys}) 
VALUES ({values}) ON - DUPLICATE KEY UPDATE""".format( - table=table, keys=keys, values=values - ) - update = ",".join( - [" {key} = values({key})".format(key=key) for key in data_list[0]] - ) + DUPLICATE KEY UPDATE""".format(table=table, + keys=keys, + values=values) + update = ",".join([ + " {key} = values({key})".format(key=key) + for key in data_list[0] + ]) sql += update try: - cursor.executemany(sql, [tuple(data.values()) for data in data_list]) + cursor.executemany( + sql, [tuple(data.values()) for data in data_list]) connection.commit() except Exception as e: connection.rollback() diff --git a/weibo_spider/writer/txt_writer.py b/weibo_spider/writer/txt_writer.py index 195b2cfa..d05bb101 100644 --- a/weibo_spider/writer/txt_writer.py +++ b/weibo_spider/writer/txt_writer.py @@ -15,19 +15,11 @@ def write_user(self, user): result_header = u"\n\n原创微博内容: \n" else: result_header = u"\n\n微博内容: \n" - result_header = ( - u"用户信息\n用户昵称:" - + self.user["nickname"] - + u"\n用户id: " - + str(self.user["id"]) - + u"\n微博数: " - + str(self.user["weibo_num"]) - + u"\n关注数: " - + str(self.user["following"]) - + u"\n粉丝数: " - + str(self.user["followers"]) - + result_header - ) + result_header = (u"用户信息\n用户昵称:" + self.user["nickname"] + u"\n用户id: " + + str(self.user["id"]) + u"\n微博数: " + + str(self.user["weibo_num"]) + u"\n关注数: " + + str(self.user["following"]) + u"\n粉丝数: " + + str(self.user["followers"]) + result_header) with open(self.file_path, "ab") as f: f.write(result_header.encode(sys.stdout.encoding)) @@ -37,26 +29,13 @@ def write_weibo(self, weibo): try: temp_result = [] for i, w in enumerate(weibo): - temp_result.append( - w["content"] - + "\n" - + u"微博位置: " - + w["publish_place"] - + "\n" - + u"发布时间: " - + w["publish_time"] - + "\n" - + u"点赞数: " - + str(w["up_num"]) - + u" 转发数: " - + str(w["retweet_num"]) - + u" 评论数: " - + str(w["comment_num"]) - + "\n" - + u"发布工具: " - + w["publish_tool"] - + "\n\n" - ) + temp_result.append(w["content"] + "\n" + u"微博位置: " + + 
w["publish_place"] + "\n" + u"发布时间: " + + w["publish_time"] + "\n" + u"点赞数: " + + str(w["up_num"]) + u" 转发数: " + + str(w["retweet_num"]) + u" 评论数: " + + str(w["comment_num"]) + "\n" + u"发布工具: " + + w["publish_tool"] + "\n\n") result = "".join(temp_result) with open(self.file_path, "ab") as f: f.write(result.encode(sys.stdout.encoding)) From 57a541d91e7dc5b650a94b2c4d1bc0ce708dc94f Mon Sep 17 00:00:00 2001 From: dataabc Date: Sun, 7 Jun 2020 19:22:34 +0800 Subject: [PATCH 175/363] =?UTF-8?q?perf:=20=E5=88=A0=E9=99=A4python2?= =?UTF-8?q?=E7=9B=B8=E5=85=B3=E4=BB=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- weibo_spider/writer/csv_writer.py | 38 +++++++------------------------ 1 file changed, 8 insertions(+), 30 deletions(-) diff --git a/weibo_spider/writer/csv_writer.py b/weibo_spider/writer/csv_writer.py index 1e60039d..62f4528d 100644 --- a/weibo_spider/writer/csv_writer.py +++ b/weibo_spider/writer/csv_writer.py @@ -1,6 +1,4 @@ -import codecs import csv -import sys import traceback from .writer import Writer @@ -31,20 +29,10 @@ def write_user(self, user): result_headers.insert(4, "被转发微博原始图片url") result_headers.insert(5, "是否为原创微博") try: - if sys.version < "3": # python2.x - reload(sys) - sys.setdefaultencoding("utf-8") - with open(self.file_path, "ab") as f: - f.write(codecs.BOM_UTF8) - writer = csv.writer(f) - writer.writerows([result_headers]) - else: # python3.x - with open(self.file_path, - "a", - encoding="utf-8-sig", - newline="") as f: - writer = csv.writer(f) - writer.writerows([result_headers]) + with open(self.file_path, "a", encoding="utf-8-sig", + newline="") as f: + writer = csv.writer(f) + writer.writerows([result_headers]) except Exception as e: print("Error: ", e) traceback.print_exc() @@ -53,20 +41,10 @@ def write_weibo(self, weibos): """将爬取的信息写入csv文件""" try: result_data = [w.values() for w in weibos] - if sys.version < "3": # python2.x - reload(sys) - sys.setdefaultencoding("utf-8") - 
with open(self.file_path, "ab") as f: - f.write(codecs.BOM_UTF8) - writer = csv.writer(f) - writer.writerows(result_data) - else: # python3.x - with open(self.file_path, - "a", - encoding="utf-8-sig", - newline="") as f: - writer = csv.writer(f) - writer.writerows(result_data) + with open(self.file_path, "a", encoding="utf-8-sig", + newline="") as f: + writer = csv.writer(f) + writer.writerows(result_data) print(u"%d条微博写入csv文件完毕,保存路径:" % len(weibos)) print(self.file_path) except Exception as e: From c6ffb68ac30b2140540cec6442e75f97ccb3a071 Mon Sep 17 00:00:00 2001 From: dataabc Date: Sun, 7 Jun 2020 19:38:47 +0800 Subject: [PATCH 176/363] =?UTF-8?q?fix:=20=E4=BF=AE=E5=A4=8D=E7=88=AC?= =?UTF-8?q?=E5=8F=96=E6=95=B0=E7=9B=AE=E7=BB=9F=E8=AE=A1=E4=B8=8D=E5=87=86?= =?UTF-8?q?=E7=A1=AE=E7=9A=84bug?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- weibo_spider/weiboSpider.py | 1 + 1 file changed, 1 insertion(+) diff --git a/weibo_spider/weiboSpider.py b/weibo_spider/weiboSpider.py index 9a45d0ee..3b23b7c8 100644 --- a/weibo_spider/weiboSpider.py +++ b/weibo_spider/weiboSpider.py @@ -203,6 +203,7 @@ def start(self): for weibos in self.get_weibo_info(): self.write_weibo(weibos) + self.got_num += len(weibos) if not self.filter: print(u"共爬取" + str(self.got_num) + u"条微博") else: From d0d91230bcb49dd4f2ec2f873c19dc748479547c Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Mon, 8 Jun 2020 02:18:24 +0800 Subject: [PATCH 177/363] Create CONTRIBUTING.md --- CONTRIBUTING.md | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 CONTRIBUTING.md diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 00000000..7d6b4a4c --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,15 @@ +## Python风格规范(建议Python新手阅读) +参考[Python风格规范](https://zh-google-styleguide.readthedocs.io/en/latest/google-python-styleguide/python_style_rules/) 
+或者[Python风格规范](https://github.com/zh-google-styleguide/zh-google-styleguide/blob/master/google-python-styleguide/python_style_rules.rst) +二者内容是一样的。 +## git提交规范 +参考[Conventional Commits](https://www.conventionalcommits.org/en/v1.0.0/) +或者[Git提交规范](https://zhuanlan.zhihu.com/p/67804026),commit描述中文英文皆可,只要符号规范就好。 +## Python之linter +本项目使用flake8。 +## Python之formatter +本项目使用yapf。 +## 引号的使用 +代码中**建议使用单引号**,只有在特殊情况下使用双引号如类、方法、函数等开头的注释使用6个双引号包括(注释左边三个双引号,右边三个双引号),或者字符串中中已经包含单引号了,则要用双引号包裹。 +## 避免过多的模块依赖 +除非有必要,尽量少使用非内置的模块,因为会增加用户的安装成本,当然如果该模块能够为本项目或用户带来很多便利,则可以使用。 From 14e3b380332b510ba7dbc539ec58424a35b3816d Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Mon, 8 Jun 2020 02:25:58 +0800 Subject: [PATCH 178/363] Update CONTRIBUTING.md --- CONTRIBUTING.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 7d6b4a4c..68547208 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -4,7 +4,7 @@ 二者内容是一样的。 ## git提交规范 参考[Conventional Commits](https://www.conventionalcommits.org/en/v1.0.0/) -或者[Git提交规范](https://zhuanlan.zhihu.com/p/67804026),commit描述中文英文皆可,只要符号规范就好。 +或者[Git提交规范](https://zhuanlan.zhihu.com/p/67804026),commit描述中文英文皆可,只要符合规范就好。 ## Python之linter 本项目使用flake8。 ## Python之formatter From 7fc570c12eb31a10d212265e4dbcf714b2ea73d4 Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Mon, 8 Jun 2020 12:36:04 +0800 Subject: [PATCH 179/363] Update CONTRIBUTING.md --- CONTRIBUTING.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 68547208..d54821d9 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,3 +1,5 @@ +## 贡献代码之前 +如果要开发新功能或者其它需要大量编写代码的修改,在开发之前最好发Issue说明一下。比如,“我准备开发xx新功能”或者“我想修改xx功能”之类的。因为要开发的功能不一定适合本项目,所以提前说明讨论,判断新功能或修改是否有必要。否则,费时费力写了很多代码,结果最后没有被采纳,可能会做一些无用功。 ## Python风格规范(建议Python新手阅读) 参考[Python风格规范](https://zh-google-styleguide.readthedocs.io/en/latest/google-python-styleguide/python_style_rules/) 
或者[Python风格规范](https://github.com/zh-google-styleguide/zh-google-styleguide/blob/master/google-python-styleguide/python_style_rules.rst) @@ -5,6 +7,8 @@ ## git提交规范 参考[Conventional Commits](https://www.conventionalcommits.org/en/v1.0.0/) 或者[Git提交规范](https://zhuanlan.zhihu.com/p/67804026),commit描述中文英文皆可,只要符合规范就好。 +## git提交建议(可选) +本建议是可选的,如果你觉得不合理,可以按自己的方式编写代码。建议每次提交都是代码改动较少的提交,如果新功能需要大量修改代码,除非不得已,否则建议将新功能分成几个小模块,每个模块提交一次。原因是这样更容易管理代码。比如,一个新功能包含几个模块。其中大部分模块都写的很多,但是有一个模块有bug。分模块提交只需要单独处理出问题的模块,其他模块不受影响。 ## Python之linter 本项目使用flake8。 ## Python之formatter From 6caab553b9c9ba65f67f569c8629f96d942f1e1f Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Mon, 8 Jun 2020 12:37:44 +0800 Subject: [PATCH 180/363] Update CONTRIBUTING.md --- CONTRIBUTING.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index d54821d9..7b7ffc89 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -8,7 +8,7 @@ 参考[Conventional Commits](https://www.conventionalcommits.org/en/v1.0.0/) 或者[Git提交规范](https://zhuanlan.zhihu.com/p/67804026),commit描述中文英文皆可,只要符合规范就好。 ## git提交建议(可选) -本建议是可选的,如果你觉得不合理,可以按自己的方式编写代码。建议每次提交都是代码改动较少的提交,如果新功能需要大量修改代码,除非不得已,否则建议将新功能分成几个小模块,每个模块提交一次。原因是这样更容易管理代码。比如,一个新功能包含几个模块。其中大部分模块都写的很多,但是有一个模块有bug。分模块提交只需要单独处理出问题的模块,其他模块不受影响。 +本建议是可选的,如果你觉得不合理,可以按自己的方式编写代码。建议每次提交都是代码改动较少的提交,如果新功能需要大量修改代码,建议将新功能分成几个小模块,每个模块提交一次。原因是这样更容易管理代码。比如,一个新功能包含几个模块。其中大部分模块都写的很好,但是有一个模块有bug。分模块提交只需要单独处理出问题的模块,其他模块不受影响。 ## Python之linter 本项目使用flake8。 ## Python之formatter From 0688fcfc07fe31b537f31dc220e232459621e715 Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Mon, 8 Jun 2020 12:39:05 +0800 Subject: [PATCH 181/363] Update CONTRIBUTING.md --- CONTRIBUTING.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 7b7ffc89..4661f48c 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -14,6 +14,6 @@ ## Python之formatter 本项目使用yapf。 ## 引号的使用 
-代码中**建议使用单引号**,只有在特殊情况下使用双引号如类、方法、函数等开头的注释使用6个双引号包括(注释左边三个双引号,右边三个双引号),或者字符串中中已经包含单引号了,则要用双引号包裹。 +代码中**建议使用单引号**,只有在特殊情况下使用双引号,如类、方法、函数等开头的注释使用6个双引号包裹(注释左边三个双引号,右边三个双引号),或者字符串中中已经包含单引号了,则要用双引号包裹。 ## 避免过多的模块依赖 除非有必要,尽量少使用非内置的模块,因为会增加用户的安装成本,当然如果该模块能够为本项目或用户带来很多便利,则可以使用。 From 9541088278dbff0d1abc32e3f1d8c490c03929b4 Mon Sep 17 00:00:00 2001 From: dataabc Date: Tue, 9 Jun 2020 20:37:44 +0800 Subject: [PATCH 182/363] =?UTF-8?q?docs:=20=E4=BC=98=E5=8C=96=E8=AF=B4?= =?UTF-8?q?=E6=98=8E=E6=96=87=E6=A1=A3=E7=BB=93=E6=9E=84?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 142 ++++++++---------------------------------------- docs/cookie.md | 9 +++ docs/example.md | 86 +++++++++++++++++++++++++++++ docs/userid.md | 11 ++++ setup.py | 2 +- 5 files changed, 130 insertions(+), 120 deletions(-) create mode 100644 docs/cookie.md create mode 100644 docs/example.md create mode 100644 docs/userid.md diff --git a/README.md b/README.md index 0a99b7c4..e65e502e 100644 --- a/README.md +++ b/README.md @@ -1,16 +1,5 @@ -* [功能](#功能) -* [输出](#输出) -* [实例](#实例) -* [运行环境](#运行环境) -* [使用说明](#使用说明) -* [定期自动爬取微博(可选)](#定期自动爬取微博可选) -* [如何获取cookie](#如何获取cookie) -* [如何获取user_id](#如何获取user_id) -* [如何获取大量user_id](#如何获取大量user_id) -* [注意事项](#注意事项) - -## 功能 -连续爬取**一个**或**多个**新浪微博用户(如[胡歌](https://weibo.cn/u/1223178222)、[迪丽热巴](https://weibo.cn/u/1669879400)、[郭碧婷](https://weibo.cn/u/1729370543))的数据,并将结果信息写入**文件**或**数据库**。写入信息几乎包括了用户微博的所有数据,主要有**用户信息**和**微博信息**两大类,前者包含用户昵称、关注数、粉丝数、微博数等等;后者包含微博正文、发布时间、发布工具、评论数等等,因为内容太多,这里不再赘述,详细内容见[输出](#输出)部分。
+# Weibo Spider +本程序可以连续爬取**一个**或**多个**新浪微博用户(如[胡歌](https://weibo.cn/u/1223178222)、[迪丽热巴](https://weibo.cn/u/1669879400)、[郭碧婷](https://weibo.cn/u/1729370543))的数据,并将结果信息写入**文件**或**数据库**。写入信息几乎包括了用户微博的所有数据,主要有**用户信息**和**微博信息**两大类,前者包含用户昵称、关注数、粉丝数、微博数等等;后者包含微博正文、发布时间、发布工具、评论数等等,因为内容太多,这里不再赘述,详细内容见[输出](#输出)部分。
具体的写入文件类型如下: - 写入**txt文件**(默认) - 写入**csv文件**(默认) @@ -28,9 +17,19 @@ 程序也可以实现**爬取结果自动更新**,即:现在爬取了目标用户的微博,几天之后,目标用户可能又发新微博了。通过设置,可以实现每隔几天**增量爬取**用户这几天发的新微博。具体方法见[定期自动爬取微博](#定期自动爬取微博可选)。
本程序需要设置用户cookie,以获取微博访问权限,后面会讲解如何获取cookie。如需[免cookie版](https://github.com/dataabc/weibo-crawler),大家可以访问[weibo-crawler](https://github.com/dataabc/weibo-crawler),二者功能类似,免cookie版获取的信息更多,用法更简单,而且不需要cookie。
如果想要获得**大量**微博,见[如何获取大量user_id](#如何获取大量user_id)部分。
-
-另外,推荐下另一个程序[weibo-search](https://github.com/dataabc/weibo-search)。该程序可以连续获取一个或多个**微博关键词搜索**结果,并将结果写入文件(可选)、数据库(可选)等。所谓微博关键词搜索即:**搜索正文中包含指定关键词的微博**,可以指定搜索的时间范围。对于非常热门的关键词,一天的时间范围,可以获得**1000万**以上的搜索结果,N天的时间范围就可以获得1000万 X N搜索结果。对于大多数关键词,一天产生的相应微博数量应该在1000万条以下,因此可以说该程序可以获得大部分关键词的全部或近似全部的搜索结果。而且该程序可以获得搜索结果的所有信息,本程序获得的微博信息该程序都能获得。
-## 输出 + +* [获取到的字段](#获取到的字段) +* [实例](#实例) +* [运行环境](#运行环境) +* [使用说明](#使用说明) +* [定期自动爬取微博(可选)](#定期自动爬取微博可选) +* [如何获取cookie](#如何获取cookie) +* [如何获取user_id](#如何获取user_id) +* [如何获取大量user_id](#如何获取大量user_id) +* [相关项目](#相关项目) +* [注意事项](#注意事项) + +## 获取到的字段 本部分为爬取到的字段信息说明,为了与[免cookie版](https://github.com/dataabc/weibo-crawler)区分,下面将两者爬取到的信息都列出来。如果是免cookie版所特有的信息,会有免cookie标注,没有标注的为二者共有的信息。
**用户信息** - 用户id:微博用户id,如"1669879400",其实这个字段本来就是已知字段 @@ -77,91 +76,8 @@
## 实例 -以爬取迪丽热巴的微博为例,我们需要修改**config.json**文件,文件内容如下: -``` -{ - "user_id_list": ["1669879400"], - "filter": 1, - "since_date": "1900-01-01", - "write_mode": ["csv", "txt", "json"], - "pic_download": 1, - "video_download": 1, - "cookie": "your cookie" -} -``` - -对于上述参数的含义以及取值范围,这里仅作简单介绍,详细信息见[程序设置](#2程序设置)。 ->**user_id_list**代表我们要爬取的微博用户的user_id,可以是一个或多个,也可以是文件路径,微博用户Dear-迪丽热巴的user_id为1669879400,具体如何获取user_id见[如何获取user_id](#如何获取user_id);
**filter**的值为1代表爬取全部原创微博,值为0代表爬取全部微博(原创+转发);
**since_date**代表我们要爬取since_date日期之后发布的微博,因为我要爬迪丽热巴的全部原创微博,所以since_date设置了一个非常早的值;
**write_mode**代表结果文件的保存类型,我想要把结果写入txt文件、csv文件和json文件,所以它的值为["csv", "txt", "json"],如果你想写入数据库,具体设置见[设置数据库](#3设置数据库可选);
**pic_download**值为1代表下载微博中的图片,值为0代表不下载;
**video_download**值为1代表下载微博中的视频,值为0代表不下载;
**cookie**是爬虫微博的cookie,具体如何获取cookie见[如何获取cookie](#如何获取cookie),获取cookie后把"your cookie"替换成真实的cookie值即可。
+如果想要知道程序的具体运行结果,可以查看[实例文档](https://github.com/dataabc/weiboSpider/blob/master/docs/example.md),该文档介绍了爬取[迪丽热巴](https://weibo.cn/u/1669879400)微博的例子,并附有部分结果文件截图。 -cookie修改完成后在weiboSpider目录下运行如下命令: -```bash -$ python3 -m weibo_spider -``` -程序会自动生成一个weibo文件夹,我们以后爬取的所有微博都被存储在这里。然后程序在该文件夹下生成一个名为"Dear-迪丽热巴"的文件夹,迪丽热巴的所有微博爬取结果都在这里。"Dear-迪丽热巴"文件夹里包含一个csv文件、一个txt文件、一个json文件、一个img文件夹和一个video文件夹,img文件夹用来存储下载到的图片,video文件夹用来存储下载到的视频。如果你设置了保存数据库功能,这些信息也会保存在数据库里,数据库设置见[设置数据库](#3设置数据库可选)部分。
-
-**csv结果文件如下所示:** -![](https://picture.cognize.me/cognize/github/weibospider/weibo_csv.png)*1669879400.csv*
-
-**txt结果文件如下所示:** -![](https://picture.cognize.me/cognize/github/weibospider/weibo_txt.png)*1669879400.txt*
-
-json文件包含迪丽热巴的用户信息和上千条微博信息,内容较多。为了表达清晰,这里仅展示两条微博。
-**json结果文件如下所示:** -``` -{ - "user": { - "id": "1669879400", - "nickname": "Dear-迪丽热巴", - "gender": "女", - "location": "上海", - "birthday": "双子座", - "description": "一只喜欢默默表演的小透明。工作联系jaywalk@jaywalk.com.cn 🍒", - "verified_reason": "嘉行传媒签约演员", - "talent": "", - "education": "上海戏剧学院", - "work": "嘉行传媒 ", - "weibo_num": 1121, - "following": 250, - "followers": 66395910 - }, - "weibo": [ - { - "id": "IonM9ryMy", - "content": "2019#微博之夜#盛典即将开启,以微博之力,让世界更美。1月11日,不见不散@微博之夜  原图 ", - "original_pictures": "http://wx1.sinaimg.cn/large/63885668ly1gao0a01kfzj20ku112k98.jpg", - "video_url": "无", - "publish_place": "无", - "publish_time": "2020-01-07 14:59", - "publish_tool": "无", - "up_num": 239242, - "retweet_num": 71914, - "comment_num": 55916 - }, - { - "id": "InB4Df73X", - "content": "#happyNEOyear#都到了2020,还不换点新pose配新装[來] 穿上@adidasneo 迪士尼联名款,让#生来好动#的我们一起玩“新”大发、自拍不重样🤳http://t.cn/AiF7nREj adidasneo的微博视频  ", - "original_pictures": "无", - "video_url": "http://f.video.weibocdn.com/000pYrGmlx07zPTskBQQ010412008AOY0E010.mp4?label=mp4_hd&template=852x480.25.0&trans_finger=62b30a3f061b162e421008955c73f536&Expires=1578569162&ssig=IV3JEbh3Zu&KID=unistore,video", - "publish_place": "无", - "publish_time": "2020-01-02 11:00", - "publish_tool": "无", - "up_num": 275419, - "retweet_num": 376734, - "comment_num": 131069 - } - ] -} -``` -*1669879400.json*
-
-**下载的图片如下所示:** -![](https://picture.cognize.me/cognize/github/weibospider/img.png)*img文件夹*
-本次下载了793张图片,大小一共1.21GB,包括她原创微博中的图片和转发微博转发理由中的图片。图片名为yyyymmdd+微博id的形式,若某条微博存在多张图片,则图片名中还会包括它在微博图片中的序号。若某张图片因为网络等原因下载失败,程序则会以“weibo_id:pic_url”的形式将出错微博id和图片url写入同文件夹下的not_downloaded.txt里;
-
-**下载的视频如下所示:** -![](https://picture.cognize.me/cognize/github/weibospider/video.png)*video文件夹*
-本次下载了70个视频,是她原创微博中的视频,视频名为yyyymmdd+微博id的形式。其中有一个视频因为网络原因下载失败,程序将它的微博id和视频url以“weibo_id:video_url”的形式写到了同文件夹下的not_downloaded.txt里。
-因为我本地没有安装MySQL数据库和MongoDB数据库,所以暂时设置成不写入数据库。如果你想要将爬取结果写入数据库,只需要先安装数据库(MySQL或MongoDB),再安装对应包(pymysql或pymongo),然后将mysql_write或mongodb_write值设置为1即可。写入MySQL需要用户名、密码等配置信息,这些配置如何设置见[设置数据库](#3设置数据库可选)部分。 ## 运行环境 - 开发语言:python2/python3 - 系统: Windows/Linux/macOS @@ -171,7 +87,7 @@ json文件包含迪丽热巴的用户信息和上千条微博信息,内容较 本程序有两个版本,你现在看到的是python3版,另一个是python2版,python2版位于[python2分支](https://github.com/dataabc/weiboSpider/tree/python2)。目前主力开发python3版,包括新功能开发和bug修复;python2版仅支持bug修复。推荐python3用户使用当前版本,推荐python2用户使用[python2版](https://github.com/dataabc/weiboSpider/tree/python2),本使用说明是python3版的使用说明。
### 1.下载脚本 本程序提供两种下载方式,一种是**源码下载安装**,另一种是**pip安装**,二者功能完全相同。如果你需要修改源码,建议使用第一种方式,否则选哪种安装方式都可以。
-**源码下载安装**
+#### 源码下载安装 下载脚本 ```bash $ git clone https://github.com/dataabc/weibospider.git @@ -181,7 +97,7 @@ $ git clone https://github.com/dataabc/weibospider.git $ pip install -r requirements.txt ``` 运行上述命令,将本项目下载到当前目录,如果下载成功当前目录会出现一个名为"weibospider"的文件夹;
-**pip安装** +#### pip安装 ```bash $ python3 -m pip install weibo-spider ``` @@ -412,26 +328,14 @@ $ python3 -m weibo_spider --config_path="config.json" 推荐第二种方法,本方法是[Evifly](https://github.com/Evifly)想出的,非常热心非常有想法的网友,在此感谢。
## 如何获取cookie -1.用Chrome打开
-2.输入微博的用户名、密码,登录,如图所示: -![](https://picture.cognize.me/cognize/github/weibospider/cookie1.png) -登录成功后会跳转到;
-3.按F12键打开Chrome开发者工具,在地址栏输入并跳转到,跳转后会显示如下类似界面: -![](https://picture.cognize.me/cognize/github/weibospider/cookie2.png) -4.依此点击Chrome开发者工具中的Network->Name中的weibo.cn->Headers->Request Headers,"Cookie:"后的值即为我们要找的cookie值,复制即可,如图所示: -![](https://picture.cognize.me/cognize/github/weibospider/cookie3.png) +要了解获取cookie方法,请查看[cookie文档](https://github.com/dataabc/weiboSpider/blob/master/docs/cookie.md)。 ## 如何获取user_id -1.打开网址,搜索我们要找的人,如"迪丽热巴",进入她的主页;
-![](https://picture.cognize.me/cognize/github/weibospider/user_home.png) -2.按照上图箭头所指,点击"资料"链接,跳转到用户资料页面;
-![](https://picture.cognize.me/cognize/github/weibospider/user_info.png) -如上图所示,迪丽热巴微博资料页的地址为"",其中的"1669879400"即为此微博的user_id。
-事实上,此微博的user_id也包含在用户主页()中,之所以我们还要点击主页中的"资料"来获取user_id,是因为很多用户的主页不是""的形式,而是""或""的形式。其中"微号"和user_id都是一串数字,如果仅仅通过主页地址提取user_id,很容易将"微号"误认为user_id。
-上述可以获得一个user_id,如果想要获得**大量**微博,见[如何获取大量user_id](#如何获取大量user_id)部分。
+要了解获取user_id方法,请查看[user_id文档](https://github.com/dataabc/weiboSpider/blob/master/docs/userid.md),该文档介绍了如何获取一个及多个微博用户user_id的方法。 -## 如何获取大量user_id -[如何获取user_id](#如何获取user_id)部分可以获得一个user_id,可以利用这一个user_id,获取该user_id微博用户关注人的user_id,一个user_id最多可以获得200个user_id,并写入user_id_list.txt文件。程序支持读文件,利用这200个user_id,可以获得最多200X200=40000个user_id。再利用这40000个user_id可以得到40000X200=8000000个user_id,如此反复,以此类推,可以获得大量user_id。本项目也支持读文件,将上述程序的结果文件user_id_list.txt路径赋值给本项目config.json的user_id_list参数,就可以获得这些user_id用户所发布的大量微博。
+## 相关项目 +- [weibo-crawler](https://github.com/dataabc/weibo-crawler) - 功能和本项目完全一样,可以不添加cookie,获取的微博属性更多; +- [weibo-search](https://github.com/dataabc/weibo-search) - 可以连续获取一个或多个**微博关键词搜索**结果,并将结果写入文件(可选)、数据库(可选)等。所谓微博关键词搜索即:**搜索正文中包含指定关键词的微博**,可以指定搜索的时间范围。对于非常热门的关键词,一天的时间范围,可以获得**1000万**以上的搜索结果,N天的时间范围就可以获得1000万 X N搜索结果。对于大多数关键词,一天产生的相应微博数量应该在1000万条以下,因此可以说该程序可以获得大部分关键词的全部或近似全部的搜索结果。而且该程序可以获得搜索结果的所有信息,本程序获得的微博信息该程序都能获得。 ## 注意事项 1.user_id不能为爬虫微博的user_id。因为要爬微博信息,必须先登录到某个微博账号,此账号我们姑且称为爬虫微博。爬虫微博访问自己的页面和访问其他用户的页面,得到的网页格式不同,所以无法爬取自己的微博信息;如果想要爬取爬虫微博内容,可以参考[获取自身微博信息](https://github.com/dataabc/weiboSpider/issues/113)。
diff --git a/docs/cookie.md b/docs/cookie.md new file mode 100644 index 00000000..3a202314 --- /dev/null +++ b/docs/cookie.md @@ -0,0 +1,9 @@ +## 如何获取cookie +1.用Chrome打开
+2.输入微博的用户名、密码,登录,如图所示: +![](https://picture.cognize.me/cognize/github/weibospider/cookie1.png) +登录成功后会跳转到<https://m.weibo.cn>;
+3.按F12键打开Chrome开发者工具,在地址栏输入并跳转到,跳转后会显示如下类似界面: +![](https://picture.cognize.me/cognize/github/weibospider/cookie2.png) +4.依此点击Chrome开发者工具中的Network->Name中的weibo.cn->Headers->Request Headers,"Cookie:"后的值即为我们要找的cookie值,复制即可,如图所示: +![](https://picture.cognize.me/cognize/github/weibospider/cookie3.png) \ No newline at end of file diff --git a/docs/example.md b/docs/example.md new file mode 100644 index 00000000..2a1ea5ad --- /dev/null +++ b/docs/example.md @@ -0,0 +1,86 @@ +## 实例 +以爬取迪丽热巴的微博为例,我们需要修改**config.json**文件,文件内容如下: +``` +{ + "user_id_list": ["1669879400"], + "filter": 1, + "since_date": "1900-01-01", + "write_mode": ["csv", "txt", "json"], + "pic_download": 1, + "video_download": 1, + "cookie": "your cookie" +} +``` + +对于上述参数的含义以及取值范围,这里仅作简单介绍,详细信息见[程序设置](#2程序设置)。 +>**user_id_list**代表我们要爬取的微博用户的user_id,可以是一个或多个,也可以是文件路径,微博用户Dear-迪丽热巴的user_id为1669879400,具体如何获取user_id见[如何获取user_id](#如何获取user_id);
**filter**的值为1代表爬取全部原创微博,值为0代表爬取全部微博(原创+转发);
**since_date**代表我们要爬取since_date日期之后发布的微博,因为我要爬迪丽热巴的全部原创微博,所以since_date设置了一个非常早的值;
**write_mode**代表结果文件的保存类型,我想要把结果写入txt文件、csv文件和json文件,所以它的值为["csv", "txt", "json"],如果你想写入数据库,具体设置见[设置数据库](#3设置数据库可选);
**pic_download**值为1代表下载微博中的图片,值为0代表不下载;
**video_download**值为1代表下载微博中的视频,值为0代表不下载;
**cookie**是爬虫微博的cookie,具体如何获取cookie见[如何获取cookie](#如何获取cookie),获取cookie后把"your cookie"替换成真实的cookie值即可。
+ +cookie修改完成后在weiboSpider目录下运行如下命令: +```bash +$ python3 -m weibo_spider +``` +程序会自动生成一个weibo文件夹,我们以后爬取的所有微博都被存储在这里。然后程序在该文件夹下生成一个名为"Dear-迪丽热巴"的文件夹,迪丽热巴的所有微博爬取结果都在这里。"Dear-迪丽热巴"文件夹里包含一个csv文件、一个txt文件、一个json文件、一个img文件夹和一个video文件夹,img文件夹用来存储下载到的图片,video文件夹用来存储下载到的视频。如果你设置了保存数据库功能,这些信息也会保存在数据库里,数据库设置见[设置数据库](#3设置数据库可选)部分。
+
+**csv结果文件如下所示:** +![](https://picture.cognize.me/cognize/github/weibospider/weibo_csv.png)*1669879400.csv*
+
+**txt结果文件如下所示:** +![](https://picture.cognize.me/cognize/github/weibospider/weibo_txt.png)*1669879400.txt*
+
+json文件包含迪丽热巴的用户信息和上千条微博信息,内容较多。为了表达清晰,这里仅展示两条微博。
+**json结果文件如下所示:** +``` +{ + "user": { + "id": "1669879400", + "nickname": "Dear-迪丽热巴", + "gender": "女", + "location": "上海", + "birthday": "双子座", + "description": "一只喜欢默默表演的小透明。工作联系jaywalk@jaywalk.com.cn 🍒", + "verified_reason": "嘉行传媒签约演员", + "talent": "", + "education": "上海戏剧学院", + "work": "嘉行传媒 ", + "weibo_num": 1121, + "following": 250, + "followers": 66395910 + }, + "weibo": [ + { + "id": "IonM9ryMy", + "content": "2019#微博之夜#盛典即将开启,以微博之力,让世界更美。1月11日,不见不散@微博之夜  原图 ", + "original_pictures": "http://wx1.sinaimg.cn/large/63885668ly1gao0a01kfzj20ku112k98.jpg", + "video_url": "无", + "publish_place": "无", + "publish_time": "2020-01-07 14:59", + "publish_tool": "无", + "up_num": 239242, + "retweet_num": 71914, + "comment_num": 55916 + }, + { + "id": "InB4Df73X", + "content": "#happyNEOyear#都到了2020,还不换点新pose配新装[來] 穿上@adidasneo 迪士尼联名款,让#生来好动#的我们一起玩“新”大发、自拍不重样🤳http://t.cn/AiF7nREj adidasneo的微博视频  ", + "original_pictures": "无", + "video_url": "http://f.video.weibocdn.com/000pYrGmlx07zPTskBQQ010412008AOY0E010.mp4?label=mp4_hd&template=852x480.25.0&trans_finger=62b30a3f061b162e421008955c73f536&Expires=1578569162&ssig=IV3JEbh3Zu&KID=unistore,video", + "publish_place": "无", + "publish_time": "2020-01-02 11:00", + "publish_tool": "无", + "up_num": 275419, + "retweet_num": 376734, + "comment_num": 131069 + } + ] +} +``` +*1669879400.json*
+
+**下载的图片如下所示:** +![](https://picture.cognize.me/cognize/github/weibospider/img.png)*img文件夹*
+本次下载了793张图片,大小一共1.21GB,包括她原创微博中的图片和转发微博转发理由中的图片。图片名为yyyymmdd+微博id的形式,若某条微博存在多张图片,则图片名中还会包括它在微博图片中的序号。若某张图片因为网络等原因下载失败,程序则会以“weibo_id:pic_url”的形式将出错微博id和图片url写入同文件夹下的not_downloaded.txt里;
+
+**下载的视频如下所示:** +![](https://picture.cognize.me/cognize/github/weibospider/video.png)*video文件夹*
+本次下载了70个视频,是她原创微博中的视频,视频名为yyyymmdd+微博id的形式。其中有一个视频因为网络原因下载失败,程序将它的微博id和视频url以“weibo_id:video_url”的形式写到了同文件夹下的not_downloaded.txt里。
+因为我本地没有安装MySQL数据库和MongoDB数据库,所以暂时设置成不写入数据库。如果你想要将爬取结果写入数据库,只需要先安装数据库(MySQL或MongoDB),再安装对应包(pymysql或pymongo),然后将mysql_write或mongodb_write值设置为1即可。写入MySQL需要用户名、密码等配置信息,这些配置如何设置见[设置数据库](#3设置数据库可选)部分。 \ No newline at end of file diff --git a/docs/userid.md b/docs/userid.md new file mode 100644 index 00000000..7fc2a2c4 --- /dev/null +++ b/docs/userid.md @@ -0,0 +1,11 @@ +## 如何获取user_id +1.打开网址,搜索我们要找的人,如"迪丽热巴",进入她的主页;
+![](https://picture.cognize.me/cognize/github/weibospider/user_home.png) +2.按照上图箭头所指,点击"资料"链接,跳转到用户资料页面;
+![](https://picture.cognize.me/cognize/github/weibospider/user_info.png) +如上图所示,迪丽热巴微博资料页的地址为"<https://weibo.cn/1669879400/info>",其中的"1669879400"即为此微博的user_id。
+事实上,此微博的user_id也包含在用户主页()中,之所以我们还要点击主页中的"资料"来获取user_id,是因为很多用户的主页不是""的形式,而是""或""的形式。其中"微号"和user_id都是一串数字,如果仅仅通过主页地址提取user_id,很容易将"微号"误认为user_id。
+上述可以获得一个user_id,如果想要获得**大量**微博,见[如何获取大量user_id](#如何获取大量user_id)部分。
+ +## 如何获取大量user_id +[如何获取user_id](#如何获取user_id)部分可以获得一个user_id,可以利用这一个user_id,获取该user_id微博用户关注人的user_id,一个user_id最多可以获得200个user_id,并写入user_id_list.txt文件。程序支持读文件,利用这200个user_id,可以获得最多200X200=40000个user_id。再利用这40000个user_id可以得到40000X200=8000000个user_id,如此反复,以此类推,可以获得大量user_id。本项目也支持读文件,将上述程序的结果文件user_id_list.txt路径赋值给本项目config.json的user_id_list参数,就可以获得这些user_id用户所发布的大量微博。
\ No newline at end of file diff --git a/setup.py b/setup.py index 54acda50..456b9b27 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setuptools.setup( name='weibo-spider', - version='0.0.7', + version='0.0.8', author='Chen Lei', author_email='chillychen1991@gmail.com', description='新浪微博爬虫,用python爬取新浪微博数据。', From c4439342942e4765d7923530e86fc105413a0971 Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Tue, 9 Jun 2020 21:11:04 +0800 Subject: [PATCH 183/363] Create settings.md --- docs/settings.md | 128 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 128 insertions(+) create mode 100644 docs/settings.md diff --git a/docs/settings.md b/docs/settings.md new file mode 100644 index 00000000..b6fff127 --- /dev/null +++ b/docs/settings.md @@ -0,0 +1,128 @@ +## 程序设置 +**源码下载安装**的用户在weiboSpider目录下运行如下命令,**pip安装**的用户在任意有写权限的目录运行如下命令: +```bash +$ python3 -m weibo_spider +``` +第一次运行会生成**config.json**文件,请打开**config.json**文件,你会看到如下内容: +``` +{ + "user_id_list": ["1669879400"], + "filter": 1, + "since_date": "2018-01-01", + "write_mode": ["csv", "txt"], + "pic_download": 1, + "video_download": 1, + "cookie": "your cookie", + "mysql_config": { + "host": "localhost", + "port": 3306, + "user": "root", + "password": "123456", + "charset": "utf8mb4" + } +} +``` +下面讲解每个参数的含义与设置方法。
+**设置user_id_list**
+user_id_list是我们要爬取的微博的id,可以是一个,也可以是多个,例如: +``` +"user_id_list": ["1223178222", "1669879400", "1729370543"], +``` +上述代码代表我们要连续爬取user_id分别为“1223178222”、 “1669879400”、 “1729370543”的三个用户的微博,具体如何获取user_id见[如何获取user_id](#如何获取user_id)。
+user_id_list的值也可以是文件路径,我们可以把要爬的所有微博用户的user_id都写到txt文件里,然后把文件的位置路径赋值给user_id_list,**推荐这种方式**。
+在txt文件中,每个user_id占一行,也可以在user_id后面加注释(可选),如用户昵称等信息,user_id和注释之间必需要有空格,文件名任意,类型为txt,位置位于本程序的同目录下,文件内容示例如下: +``` +1223178222 胡歌 +1669879400 迪丽热巴 +1729370543 郭碧婷 +``` +假如文件叫user_id_list.txt,则user_id_list设置代码为: +``` +"user_id_list": "user_id_list.txt", +``` +**设置filter**
+filter控制爬取范围,值为1代表爬取全部原创微博,值为0代表爬取全部微博(原创+转发)。例如,如果要爬全部原创微博,请使用如下代码: +``` +"filter": 1, +``` +**设置since_date**
+since_date值可以是日期,也可以是整数。如果是日期,代表爬取该日期之后的微博,格式应为“yyyy-mm-dd”,如: +``` +"since_date": "2018-01-01", +``` +代表爬取从2018年1月1日到现在的微博。
+如果是整数,代表爬取最近n天的微博,如: +``` +"since_date": 10, +``` +代表爬取最近10天的微博,这个说法不是特别准确,准确说是爬取发布时间从**10天前到本程序开始执行时**之间的微博。
+**since_date是所有user的爬取起始时间,非常不灵活。如果你要爬多个用户,并且想单独为每个用户设置一个since_date,可以使用[定期自动爬取微博](#定期自动爬取微博可选)方法二中的方法,该方法可以为多个用户设置不同的since_date,非常灵活。**
+**设置write_mode**
+write_mode控制结果文件格式,取值范围是csv、txt、json、mongo和mysql,分别代表将结果文件写入csv、txt、json、MongoDB和MySQL数据库。write_mode可以同时包含这些取值中的一个或几个,如: +``` +"write_mode": ["csv", "txt"], +``` +代表将结果信息写入csv文件和txt文件。特别注意,如果你想写入数据库,除了在write_mode添加对应数据库的名字外,还应该安装相关数据库和对应python模块,具体操作见[设置数据库](#3设置数据库可选)部分。
+**设置pic_download**
+pic_download控制是否下载微博中的图片,值为1代表下载,值为0代表不下载,如 +``` +"pic_download": 1, +``` +代表下载微博中的图片。
+**设置video_download**
+video_download控制是否下载微博中的视频,值为1代表下载,值为0代表不下载,如 +``` +"video_download": 1, +``` +代表下载微博中的视频。
+**设置cookie**
+请按照[如何获取cookie](#如何获取cookie),获取cookie,然后将“your cookie”替换成真实的cookie值。
+**设置mysql_config(可选)**
+mysql_config控制mysql参数配置。如果你不需要将结果信息写入mysql,这个参数可以忽略,即删除或保留都无所谓;如果你需要写入mysql且config.json文件中mysql_config的配置与你的mysql配置不一样,请将该值改成你自己mysql中的参数配置。 + +## 设置数据库(可选) +本部分是可选部分,如果不需要将爬取信息写入数据库,可跳过这一步。本程序目前支持MySQL数据库和MongoDB数据库,如果你需要写入其它数据库,可以参考这两个数据库的写法自己编写。
+**MySQL数据库写入**
+要想将爬取信息写入MySQL,请根据自己的系统环境安装MySQL,然后命令行执行: +```bash +$ pip install pymysql +``` +**MongoDB数据库写入**
+要想将爬取信息写入MongoDB,请根据自己的系统环境安装MongoDB,然后命令行执行: +```bash +$ pip install pymongo +``` +MySQL和MongoDB数据库的写入内容一样。程序首先会创建一个名为"weibo"的数据库,然后再创建"user"表和"weibo"表,包含爬取的所有内容。爬取到的微博**用户信息**或插入或更新,都会存储到user表里;爬取到的**微博信息**或插入或更新,都会存储到weibo表里,两个表通过user_id关联。如果想了解两个表的具体字段,请点击"详情"。 +
+详情 + +**user表**
+**id**:存储用户id,如"1669879400";
+**nickname**:存储用户昵称,如"Dear-迪丽热巴";
+**gender**:存储用户性别;
+**location**:存储用户所在地;
+**birthday**:存储用户出生日期;
+**description**:存储用户简介;
+**verified_reason**:存储用户认证;
+**talent**:存储用户标签;
+**education**:存储用户学习经历;
+**work**:存储用户工作经历;
+**weibo_num**:存储微博数;
+**following**:存储关注数;
+**followers**:存储粉丝数。
+*** +**weibo表**
+**id**:存储微博id;
+**user_id**:存储微博发布者的用户id,如"1669879400";
+**content**:存储微博正文;
+**article_url**:存储微博中头条文章的url,若微博中不存在头条文章,则值为'';
+**original_pictures**:存储原创微博的原始图片url和转发微博转发理由中的图片url。若某条微博有多张图片,则存储多个url,以英文逗号分割;若某微博没有图片,则值为"无";
+**retweet_pictures**:存储被转发微博中的原始图片url。当最新微博为原创微博或者为没有图片的转发微博时,则值为"无",否则为被转发微博的图片url。若有多张图片,则存储多个url,以英文逗号分割;
+**publish_place**:存储微博的发布位置。如果某条微博没有位置信息,则值为"无";
+**publish_time**:存储微博的发布时间;
+**up_num**:存储微博获得的点赞数;
+**retweet_num**:存储微博获得的转发数;
+**comment_num**:存储微博获得的评论数;
+**publish_tool**:存储微博的发布工具。 + +
From d408d8aee4c0c1585cf123bdb0d843ce57f8ece3 Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Tue, 9 Jun 2020 21:17:24 +0800 Subject: [PATCH 184/363] Create automation.md --- docs/automation.md | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 docs/automation.md diff --git a/docs/automation.md b/docs/automation.md new file mode 100644 index 00000000..5251fb56 --- /dev/null +++ b/docs/automation.md @@ -0,0 +1,40 @@ +## 定期自动爬取微博(可选) +我们爬取了微博以后,很多微博账号又可能发了一些新微博,定期自动爬取微博就是每隔一段时间自动运行程序,自动爬取这段时间产生的新微博(忽略以前爬过的旧微博)。本部分为可选部分,如果不需要可以忽略。
+思路是**利用第三方软件,如crontab,让程序每隔一段时间运行一次**。因为是要跳过以前爬过的旧微博,只爬新微博。所以需要**设置一个动态的since_date**。很多时候我们使用的since_date是固定的,比如since_date="2018-01-01",程序就会按照这个设置从最新的微博一直爬到发布时间为2018-01-01的微博(包括这个时间)。因为我们想追加新微博,跳过旧微博。第二次爬取时since_date值就应该是当前时间到上次爬取的时间。 +如果我们使用最原始的方式实现追加爬取,应该是这样: +``` +假如程序第一次执行时间是2019-06-06,since_date假如为2018-01-01,那这一次就是爬取从2018-01-01到2019-06-06这段时间用户所发的微博; +第二次爬取,我们想要接着上次的爬,那since_date的值应该是上次程序执行的日期,即2019-06-06 +``` +上面的方法太麻烦,因为每次都要手动设置since_date。因此我们需要动态设置since_date,即程序根据实际情况,自动生成since_date。
+有两种方法实现动态更新since_date,**推荐使用方法二**。
+**方法一:将since_date设置成整数**
+将config.json文件中的since_date设置成整数,如: +``` +"since_date": 10, +``` +这个配置告诉程序爬取最近10天的微博,更准确说是爬取发布时间从**10天前到本程序开始执行时**之间的微博。这样since_date就是一个动态的变量,每次程序执行时,它的值就是当前日期减10。配合crontab每9天或10天执行一次,就实现了定期追加爬取。
+**方法二:将上次执行程序的时间写入文件(推荐)**
+这个方法很简单,就是使用[程序设置](#2程序设置)中**设置user_id_list**的第二种方法设置user_id_list,这样设置就全部结束了。
+说下这个方法的好处和原理,假如你的txt文件内容为: +``` +1669879400 +1223178222 胡歌 +1729370543 郭碧婷 2019-01-01 19:28 +``` +第一次执行时,因为第一行和第二行都没有写时间,程序会按照config.json文件中since_date的值爬取,第三行有时间“2019-01-01 19:28”,程序就会把这个时间当作since_date。每个用户爬取结束程序都会自动更新txt文件,每一行第一部分是user_id,第二部分是用户昵称,第三部分是程序**准备**爬取该用户第一条微博(最新微博)时的时间。爬完三个用户后,txt文件的内容自动更新为: +``` +1669879400 Dear-迪丽热巴 2020-01-13 19:18 +1223178222 胡歌 2020-01-13 19:28 +1729370543 郭碧婷 2020-01-13 19:33 +``` +下次再爬取微博的时候,程序会把每行的时间数据作为since_date。这样的好处一是不用修改since_date,程序自动更新;二是每一个用户都可以单独拥有只属于自己的since_date,每个用户的since_date相互独立,互不干扰。since_date既可以是“yyyy-mm-dd”格式,也可以是“yyyy-mm-dd hh:mm”格式。比如,现在又添加了一个新用户,例如杨紫,你想获取她从2018-01-23到现在的全部微博,只需要这样修改txt文件: +``` +1669879400 Dear-迪丽热巴 2020-01-13 19:18 +1223178222 胡歌 2020-01-13 19:28 +1729370543 郭碧婷 2020-01-13 19:33 +1227368500 杨紫 2018-01-23 +``` +注意每一行的用户配置参数以空格分隔,如果第一个参数全部由数字组成,程序就认为此行为一个用户的配置,否则程序会认为该行只是注释,跳过该行;第二个参数可以为任意格式,建议写用户昵称;第三个如果是日期格式(yyyy-mm-dd),程序就将该日期设置为用户自己的since_date,否则使用config.json中的since_date爬取该用户的微博,第二个参数和第三个参数也可以不填。 + +推荐第二种方法,本方法是[Evifly](https://github.com/Evifly)想出的,非常热心非常有想法的网友,在此感谢。
From 3802319d0dbc6c597f6117271aaea72b9450e60a Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Tue, 9 Jun 2020 21:24:38 +0800 Subject: [PATCH 185/363] Update README.md --- README.md | 177 ++---------------------------------------------------- 1 file changed, 6 insertions(+), 171 deletions(-) diff --git a/README.md b/README.md index e65e502e..a6f3e927 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,6 @@ 当然,如果你只对用户信息感兴趣,而不需要爬用户的微博,也可以通过设置实现只爬取微博用户信息的功能。
程序也可以实现**爬取结果自动更新**,即:现在爬取了目标用户的微博,几天之后,目标用户可能又发新微博了。通过设置,可以实现每隔几天**增量爬取**用户这几天发的新微博。具体方法见[定期自动爬取微博](#定期自动爬取微博可选)。
本程序需要设置用户cookie,以获取微博访问权限,后面会讲解如何获取cookie。如需[免cookie版](https://github.com/dataabc/weibo-crawler),大家可以访问,二者功能类似,免cookie版获取的信息更多,用法更简单,而且不需要cookie。
-如果想要获得**大量**微博,见[如何获取大量user_id](#如何获取大量user_id)部分。
* [获取到的字段](#获取到的字段) * [实例](#实例) @@ -25,7 +24,6 @@ * [定期自动爬取微博(可选)](#定期自动爬取微博可选) * [如何获取cookie](#如何获取cookie) * [如何获取user_id](#如何获取user_id) -* [如何获取大量user_id](#如何获取大量user_id) * [相关项目](#相关项目) * [注意事项](#注意事项) @@ -76,7 +74,7 @@
## 实例 -如果想要知道程序的具体运行结果,可以查看[实例文档](https://github.com/dataabc/weiboSpider/blob/master/docs/example.md),该文档介绍了爬取[迪丽热巴](https://weibo.cn/u/1669879400)微博的例子,并附有部分结果文件截图。 +如果想要知道程序的具体运行结果,可以查看[实例文档](https://github.com/dataabc/weiboSpider/blob/master/docs/example.md),该文档介绍了爬取[迪丽热巴微博](https://weibo.cn/u/1669879400)的例子,并附有部分结果文件截图。 ## 运行环境 - 开发语言:python2/python3 @@ -102,135 +100,9 @@ $ pip install -r requirements.txt $ python3 -m pip install weibo-spider ``` ### 2.程序设置 -**源码下载安装**的用户在weiboSpider目录下运行如下命令,**pip安装**的用户在任意有写权限的目录运行如下命令: -```bash -$ python3 -m weibo_spider -``` -第一次运行会生成**config.json**文件,请打开**config.json**文件,你会看到如下内容: -``` -{ - "user_id_list": ["1669879400"], - "filter": 1, - "since_date": "2018-01-01", - "write_mode": ["csv", "txt"], - "pic_download": 1, - "video_download": 1, - "cookie": "your cookie", - "mysql_config": { - "host": "localhost", - "port": 3306, - "user": "root", - "password": "123456", - "charset": "utf8mb4" - } -} -``` -下面讲解每个参数的含义与设置方法。
-**设置user_id_list**
-user_id_list是我们要爬取的微博的id,可以是一个,也可以是多个,例如: -``` -"user_id_list": ["1223178222", "1669879400", "1729370543"], -``` -上述代码代表我们要连续爬取user_id分别为“1223178222”、 “1669879400”、 “1729370543”的三个用户的微博,具体如何获取user_id见[如何获取user_id](#如何获取user_id)。
-user_id_list的值也可以是文件路径,我们可以把要爬的所有微博用户的user_id都写到txt文件里,然后把文件的位置路径赋值给user_id_list,**推荐这种方式**。
-在txt文件中,每个user_id占一行,也可以在user_id后面加注释(可选),如用户昵称等信息,user_id和注释之间必需要有空格,文件名任意,类型为txt,位置位于本程序的同目录下,文件内容示例如下: -``` -1223178222 胡歌 -1669879400 迪丽热巴 -1729370543 郭碧婷 -``` -假如文件叫user_id_list.txt,则user_id_list设置代码为: -``` -"user_id_list": "user_id_list.txt", -``` -**设置filter**
-filter控制爬取范围,值为1代表爬取全部原创微博,值为0代表爬取全部微博(原创+转发)。例如,如果要爬全部原创微博,请使用如下代码: -``` -"filter": 1, -``` -**设置since_date**
-since_date值可以是日期,也可以是整数。如果是日期,代表爬取该日期之后的微博,格式应为“yyyy-mm-dd”,如: -``` -"since_date": "2018-01-01", -``` -代表爬取从2018年1月1日到现在的微博。
-如果是整数,代表爬取最近n天的微博,如: -``` -"since_date": 10, -``` -代表爬取最近10天的微博,这个说法不是特别准确,准确说是爬取发布时间从**10天前到本程序开始执行时**之间的微博。
-**since_date是所有user的爬取起始时间,非常不灵活。如果你要爬多个用户,并且想单独为每个用户设置一个since_date,可以使用[定期自动爬取微博](#定期自动爬取微博可选)方法二中的方法,该方法可以为多个用户设置不同的since_date,非常灵活。**
-**设置write_mode**
-write_mode控制结果文件格式,取值范围是csv、txt、json、mongo和mysql,分别代表将结果文件写入csv、txt、json、MongoDB和MySQL数据库。write_mode可以同时包含这些取值中的一个或几个,如: -``` -"write_mode": ["csv", "txt"], -``` -代表将结果信息写入csv文件和txt文件。特别注意,如果你想写入数据库,除了在write_mode添加对应数据库的名字外,还应该安装相关数据库和对应python模块,具体操作见[设置数据库](#3设置数据库可选)部分。
-**设置pic_download**
-pic_download控制是否下载微博中的图片,值为1代表下载,值为0代表不下载,如 -``` -"pic_download": 1, -``` -代表下载微博中的图片。
-**设置video_download**
-video_download控制是否下载微博中的视频,值为1代表下载,值为0代表不下载,如 -``` -"video_download": 1, -``` -代表下载微博中的视频。
-**设置cookie**
-请按照[如何获取cookie](#如何获取cookie),获取cookie,然后将“your cookie”替换成真实的cookie值。
-**设置mysql_config(可选)**
-mysql_config控制mysql参数配置。如果你不需要将结果信息写入mysql,这个参数可以忽略,即删除或保留都无所谓;如果你需要写入mysql且config.json文件中mysql_config的配置与你的mysql配置不一样,请将该值改成你自己mysql中的参数配置。 - -### 3.设置数据库(可选) -本部分是可选部分,如果不需要将爬取信息写入数据库,可跳过这一步。本程序目前支持MySQL数据库和MongoDB数据库,如果你需要写入其它数据库,可以参考这两个数据库的写法自己编写。
-**MySQL数据库写入**
-要想将爬取信息写入MySQL,请根据自己的系统环境安装MySQL,然后命令行执行: -```bash -$ pip install pymysql -``` -**MongoDB数据库写入**
-要想将爬取信息写入MongoDB,请根据自己的系统环境安装MongoDB,然后命令行执行: -```bash -$ pip install pymongo -``` -MySQL和MongDB数据库的写入内容一样。程序首先会创建一个名为"weibo"的数据库,然后再创建"user"表和"weibo"表,包含爬取的所有内容。爬取到的微博**用户信息**或插入或更新,都会存储到user表里;爬取到的**微博信息**或插入或更新,都会存储到weibo表里,两个表通过user_id关联。如果想了解两个表的具体字段,请点击"详情"。 -
-详情 - -**user表**
-**id**:存储用户id,如"1669879400";
-**nickname**:存储用户昵称,如"Dear-迪丽热巴";
-**gender**:存储用户性别;
-**location**:存储用户所在地;
-**birthday**:存储用户出生日期;
-**description**:存储用户简介;
-**verified_reason**:存储用户认证;
-**talent**:存储用户标签;
-**education**:存储用户学习经历;
-**work**:存储用户工作经历;
-**weibo_num**:存储微博数;
-**following**:存储关注数;
-**followers**:存储粉丝数。
-*** -**weibo表**
-**id**:存储微博id;
-**user_id**:存储微博发布者的用户id,如"1669879400";
-**content**:存储微博正文;
-**article_url**:存储微博中头条文章的url,若微博中不存在头条文章,则值为'';
-**original_pictures**:存储原创微博的原始图片url和转发微博转发理由中的图片url。若某条微博有多张图片,则存储多个url,以英文逗号分割;若某微博没有图片,则值为"无";
-**retweet_pictures**:存储被转发微博中的原始图片url。当最新微博为原创微博或者为没有图片的转发微博时,则值为"无",否则为被转发微博的图片url。若有多张图片,则存储多个url,以英文逗号分割;
-**publish_place**:存储微博的发布位置。如果某条微博没有位置信息,则值为"无";
-**publish_time**:存储微博的发布时间;
-**up_num**:存储微博获得的点赞数;
-**retweet_num**:存储微博获得的转发数;
-**comment_num**:存储微博获得的评论数;
-**publish_tool**:存储微博的发布工具。 - -
+要了解程序设置,请查看[程序设置文档](https://github.com/dataabc/weiboSpider/blob/master/docs/settings.md)。 -### 4.运行脚本 +### 3.运行脚本 **源码下载安装**的用户可以在weiboSpider目录运行如下命令,**pip安装**的用户可以在任意有写权限的目录运行如下命令 ```bash $ python3 -m weibo_spider @@ -239,7 +111,7 @@ $ python3 -m weibo_spider ```bash $ python3 -m weibo_spider --config_path="config.json" ``` -### 5.按需求修改脚本(可选) +### 4.按需求修改脚本(可选) 本部分为可选部分,如果你不需要自己修改代码或添加新功能,可以忽略此部分。
本程序所有代码都位于weiboSpider.py文件,程序主体是一个Weibo类,上述所有功能都是通过在main函数调用Weibo类实现的,默认的调用代码如下: ```python @@ -288,45 +160,8 @@ $ python3 -m weibo_spider --config_path="config.json" ## 定期自动爬取微博(可选) -我们爬取了微博以后,很多微博账号又可能发了一些新微博,定期自动爬取微博就是每隔一段时间自动运行程序,自动爬取这段时间产生的新微博(忽略以前爬过的旧微博)。本部分为可选部分,如果不需要可以忽略。
-思路是**利用第三方软件,如crontab,让程序每隔一段时间运行一次**。因为是要跳过以前爬过的旧微博,只爬新微博。所以需要**设置一个动态的since_date**。很多时候我们使用的since_date是固定的,比如since_date="2018-01-01",程序就会按照这个设置从最新的微博一直爬到发布时间为2018-01-01的微博(包括这个时间)。因为我们想追加新微博,跳过旧微博。第二次爬取时since_date值就应该是当前时间到上次爬取的时间。 -如果我们使用最原始的方式实现追加爬取,应该是这样: -``` -假如程序第一次执行时间是2019-06-06,since_date假如为2018-01-01,那这一次就是爬取从2018-01-01到2019-06-06这段时间用户所发的微博; -第二次爬取,我们想要接着上次的爬,那since_date的值应该是上次程序执行的日期,即2019-06-06 -``` -上面的方法太麻烦,因为每次都要手动设置since_date。因此我们需要动态设置since_date,即程序根据实际情况,自动生成since_date。
-有两种方法实现动态更新since_date,**推荐使用方法二**。
-**方法一:将since_date设置成整数**
-将config.json文件中的since_date设置成整数,如: -``` -"since_date": 10, -``` -这个配置告诉程序爬取最近10天的微博,更准确说是爬取发布时间从**10天前到本程序开始执行时**之间的微博。这样since_date就是一个动态的变量,每次程序执行时,它的值就是当前日期减10。配合crontab每9天或10天执行一次,就实现了定期追加爬取。
-**方法二:将上次执行程序的时间写入文件(推荐)**
-这个方法很简单,就是使用[程序设置](#2程序设置)中**设置user_id_list**的第二种方法设置user_id_list,这样设置就全部结束了。
-说下这个方法的好处和原理,假如你的txt文件内容为: -``` -1669879400 -1223178222 胡歌 -1729370543 郭碧婷 2019-01-01 19:28 -``` -第一次执行时,因为第一行和第二行都没有写时间,程序会按照config.json文件中since_date的值爬取,第三行有时间“2019-01-01 19:28”,程序就会把这个时间当作since_date。每个用户爬取结束程序都会自动更新txt文件,每一行第一部分是user_id,第二部分是用户昵称,第三部分是程序**准备**爬取该用户第一条微博(最新微博)时的时间。爬完三个用户后,txt文件的内容自动更新为: -``` -1669879400 Dear-迪丽热巴 2020-01-13 19:18 -1223178222 胡歌 2020-01-13 19:28 -1729370543 郭碧婷 2020-01-13 19:33 -``` -下次再爬取微博的时候,程序会把每行的时间数据作为since_date。这样的好处一是不用修改since_date,程序自动更新;二是每一个用户都可以单独拥有只属于自己的since_date,每个用户的since_date相互独立,互不干扰。since_date既可以是“yyyy-mm-dd”格式,也可以是“yyyy-mm-dd hh:mm”格式。比如,现在又添加了一个新用户,例如杨紫,你想获取她从2018-01-23到现在的全部微博,只需要这样修改txt文件: -``` -1669879400 Dear-迪丽热巴 2020-01-13 19:18 -1223178222 胡歌 2020-01-13 19:28 -1729370543 郭碧婷 2020-01-13 19:33 -1227368500 杨紫 2018-01-23 -``` -注意每一行的用户配置参数以空格分隔,如果第一个参数全部由数字组成,程序就认为此行为一个用户的配置,否则程序会认为该行只是注释,跳过该行;第二个参数可以为任意格式,建议写用户昵称;第三个如果是日期格式(yyyy-mm-dd),程序就将该日期设置为用户自己的since_date,否则使用config.json中的since_date爬取该用户的微博,第二个参数和第三个参数也可以不填。 +要想让程序每隔一段时间自动爬取,且爬取的内容为新增加的内容(不包括已经获取的微博),请查看[定期自动爬取微博](https://github.com/dataabc/weiboSpider/blob/master/docs/automation.md)。 -推荐第二种方法,本方法是[Evifly](https://github.com/Evifly)想出的,非常热心非常有想法的网友,在此感谢。
## 如何获取cookie 要了解获取cookie方法,请查看[cookie文档](https://github.com/dataabc/weiboSpider/blob/master/docs/cookie.md)。 @@ -338,5 +173,5 @@ $ python3 -m weibo_spider --config_path="config.json" - [weibo-search](https://github.com/dataabc/weibo-search) - 可以连续获取一个或多个**微博关键词搜索**结果,并将结果写入文件(可选)、数据库(可选)等。所谓微博关键词搜索即:**搜索正文中包含指定关键词的微博**,可以指定搜索的时间范围。对于非常热门的关键词,一天的时间范围,可以获得**1000万**以上的搜索结果,N天的时间范围就可以获得1000万 X N搜索结果。对于大多数关键词,一天产生的相应微博数量应该在1000万条以下,因此可以说该程序可以获得大部分关键词的全部或近似全部的搜索结果。而且该程序可以获得搜索结果的所有信息,本程序获得的微博信息该程序都能获得。 ## 注意事项 -1.user_id不能为爬虫微博的user_id。因为要爬微博信息,必须先登录到某个微博账号,此账号我们姑且称为爬虫微博。爬虫微博访问自己的页面和访问其他用户的页面,得到的网页格式不同,所以无法爬取自己的微博信息;如果想要爬取爬虫微博内容,可以参考[获取自身微博信息](https://github.com/dataabc/weiboSpider/issues/113)。
+1.user_id不能为爬虫微博的user_id。因为要爬微博信息,必须先登录到某个微博账号,此账号我们姑且称为爬虫微博。爬虫微博访问自己的页面和访问其他用户的页面,得到的网页格式不同,所以无法爬取自己的微博信息;如果想要爬取爬虫微博内容,可以参考[获取自身微博信息](https://github.com/dataabc/weiboSpider/issues/113);
2.cookie有期限限制,超过有效期需重新更新cookie。 From 661158ba9e4a703caf9c47230c7ecebbb7d1ec6f Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Tue, 9 Jun 2020 21:34:47 +0800 Subject: [PATCH 186/363] Update example.md --- docs/example.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/example.md b/docs/example.md index 2a1ea5ad..e55ecac3 100644 --- a/docs/example.md +++ b/docs/example.md @@ -12,14 +12,14 @@ } ``` -对于上述参数的含义以及取值范围,这里仅作简单介绍,详细信息见[程序设置](#2程序设置)。 ->**user_id_list**代表我们要爬取的微博用户的user_id,可以是一个或多个,也可以是文件路径,微博用户Dear-迪丽热巴的user_id为1669879400,具体如何获取user_id见[如何获取user_id](#如何获取user_id);
**filter**的值为1代表爬取全部原创微博,值为0代表爬取全部微博(原创+转发);
**since_date**代表我们要爬取since_date日期之后发布的微博,因为我要爬迪丽热巴的全部原创微博,所以since_date设置了一个非常早的值;
**write_mode**代表结果文件的保存类型,我想要把结果写入txt文件、csv文件和json文件,所以它的值为["csv", "txt", "json"],如果你想写入数据库,具体设置见[设置数据库](#3设置数据库可选);
**pic_download**值为1代表下载微博中的图片,值为0代表不下载;
**video_download**值为1代表下载微博中的视频,值为0代表不下载;
**cookie**是爬虫微博的cookie,具体如何获取cookie见[如何获取cookie](#如何获取cookie),获取cookie后把"your cookie"替换成真实的cookie值即可。
+对于上述参数的含义以及取值范围,这里仅作简单介绍,详细信息见[程序设置](https://github.com/dataabc/weiboSpider/blob/master/docs/settings.md)。 +>**user_id_list**代表我们要爬取的微博用户的user_id,可以是一个或多个,也可以是文件路径,微博用户Dear-迪丽热巴的user_id为1669879400,具体如何获取user_id见[如何获取user_id](https://github.com/dataabc/weiboSpider/blob/master/docs/userid.md);
**filter**的值为1代表爬取全部原创微博,值为0代表爬取全部微博(原创+转发);
**since_date**代表我们要爬取since_date日期之后发布的微博,因为我要爬迪丽热巴的全部原创微博,所以since_date设置了一个非常早的值;
**write_mode**代表结果文件的保存类型,我想要把结果写入txt文件、csv文件和json文件,所以它的值为["csv", "txt", "json"],如果你想写入数据库,具体设置见[设置数据库](https://github.com/dataabc/weiboSpider/blob/master/docs/settings.md#设置数据库可选);
**pic_download**值为1代表下载微博中的图片,值为0代表不下载;
**video_download**值为1代表下载微博中的视频,值为0代表不下载;
**cookie**是爬虫微博的cookie,具体如何获取cookie见[cookie文档](https://github.com/dataabc/weiboSpider/blob/master/docs/cookie.md),获取cookie后把"your cookie"替换成真实的cookie值即可。
cookie修改完成后在weiboSpider目录下运行如下命令: ```bash $ python3 -m weibo_spider ``` -程序会自动生成一个weibo文件夹,我们以后爬取的所有微博都被存储在这里。然后程序在该文件夹下生成一个名为"Dear-迪丽热巴"的文件夹,迪丽热巴的所有微博爬取结果都在这里。"Dear-迪丽热巴"文件夹里包含一个csv文件、一个txt文件、一个json文件、一个img文件夹和一个video文件夹,img文件夹用来存储下载到的图片,video文件夹用来存储下载到的视频。如果你设置了保存数据库功能,这些信息也会保存在数据库里,数据库设置见[设置数据库](#3设置数据库可选)部分。
+程序会自动生成一个weibo文件夹,我们以后爬取的所有微博都被存储在这里。然后程序在该文件夹下生成一个名为"Dear-迪丽热巴"的文件夹,迪丽热巴的所有微博爬取结果都在这里。"Dear-迪丽热巴"文件夹里包含一个csv文件、一个txt文件、一个json文件、一个img文件夹和一个video文件夹,img文件夹用来存储下载到的图片,video文件夹用来存储下载到的视频。如果你设置了保存数据库功能,这些信息也会保存在数据库里,数据库设置见[设置数据库](https://github.com/dataabc/weiboSpider/blob/master/docs/settings.md#设置数据库可选)部分。

**csv结果文件如下所示:** ![](https://picture.cognize.me/cognize/github/weibospider/weibo_csv.png)*1669879400.csv*
@@ -83,4 +83,4 @@ json文件包含迪丽热巴的用户信息和上千条微博信息,内容较 **下载的视频如下所示:** ![](https://picture.cognize.me/cognize/github/weibospider/video.png)*video文件夹*
本次下载了70个视频,是她原创微博中的视频,视频名为yyyymmdd+微博id的形式。其中有一个视频因为网络原因下载失败,程序将它的微博id和视频url以“weibo_id:video_url”的形式写到了同文件夹下的not_downloaded.txt里。
-因为我本地没有安装MySQL数据库和MongoDB数据库,所以暂时设置成不写入数据库。如果你想要将爬取结果写入数据库,只需要先安装数据库(MySQL或MongoDB),再安装对应包(pymysql或pymongo),然后将mysql_write或mongodb_write值设置为1即可。写入MySQL需要用户名、密码等配置信息,这些配置如何设置见[设置数据库](#3设置数据库可选)部分。 \ No newline at end of file +因为我本地没有安装MySQL数据库和MongoDB数据库,所以暂时设置成不写入数据库。如果你想要将爬取结果写入数据库,只需要先安装数据库(MySQL或MongoDB),再安装对应包(pymysql或pymongo),然后将mysql_write或mongodb_write值设置为1即可。写入MySQL需要用户名、密码等配置信息,这些配置如何设置见[设置数据库](https://github.com/dataabc/weiboSpider/blob/master/docs/settings.md#设置数据库可选)部分。 From bdf130e3ca6ccac8df43e1f6daf8c067ad563bb0 Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Tue, 9 Jun 2020 21:36:09 +0800 Subject: [PATCH 187/363] Update automation.md --- docs/automation.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/automation.md b/docs/automation.md index 5251fb56..933a9ff9 100644 --- a/docs/automation.md +++ b/docs/automation.md @@ -15,7 +15,7 @@ ``` 这个配置告诉程序爬取最近10天的微博,更准确说是爬取发布时间从**10天前到本程序开始执行时**之间的微博。这样since_date就是一个动态的变量,每次程序执行时,它的值就是当前日期减10。配合crontab每9天或10天执行一次,就实现了定期追加爬取。
**方法二:将上次执行程序的时间写入文件(推荐)**
-这个方法很简单,就是使用[程序设置](#2程序设置)中**设置user_id_list**的第二种方法设置user_id_list,这样设置就全部结束了。
+这个方法很简单,就是使用[程序设置](https://github.com/dataabc/weiboSpider/blob/master/docs/settings.md)中**设置user_id_list**的第二种方法设置user_id_list,这样设置就全部结束了。
说下这个方法的好处和原理,假如你的txt文件内容为: ``` 1669879400 From 993ffdf3befed6b9645b6636a684d65a6985d262 Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Tue, 9 Jun 2020 21:42:17 +0800 Subject: [PATCH 188/363] Update settings.md --- docs/settings.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/settings.md b/docs/settings.md index b6fff127..77e6eec3 100644 --- a/docs/settings.md +++ b/docs/settings.md @@ -28,7 +28,7 @@ user_id_list是我们要爬取的微博的id,可以是一个,也可以是多 ``` "user_id_list": ["1223178222", "1669879400", "1729370543"], ``` -上述代码代表我们要连续爬取user_id分别为“1223178222”、 “1669879400”、 “1729370543”的三个用户的微博,具体如何获取user_id见[如何获取user_id](#如何获取user_id)。
+上述代码代表我们要连续爬取user_id分别为“1223178222”、 “1669879400”、 “1729370543”的三个用户的微博,具体如何获取user_id见[如何获取user_id](https://github.com/dataabc/weiboSpider/blob/master/docs/userid.md)。
user_id_list的值也可以是文件路径,我们可以把要爬的所有微博用户的user_id都写到txt文件里,然后把文件的位置路径赋值给user_id_list,**推荐这种方式**。
在txt文件中,每个user_id占一行,也可以在user_id后面加注释(可选),如用户昵称等信息,user_id和注释之间必需要有空格,文件名任意,类型为txt,位置位于本程序的同目录下,文件内容示例如下: ``` @@ -56,13 +56,13 @@ since_date值可以是日期,也可以是整数。如果是日期,代表爬 "since_date": 10, ``` 代表爬取最近10天的微博,这个说法不是特别准确,准确说是爬取发布时间从**10天前到本程序开始执行时**之间的微博。
-**since_date是所有user的爬取起始时间,非常不灵活。如果你要爬多个用户,并且想单独为每个用户设置一个since_date,可以使用[定期自动爬取微博](#定期自动爬取微博可选)方法二中的方法,该方法可以为多个用户设置不同的since_date,非常灵活。**
+**since_date是所有user的爬取起始时间,非常不灵活。如果你要爬多个用户,并且想单独为每个用户设置一个since_date,可以使用[定期自动爬取微博](https://github.com/dataabc/weiboSpider/blob/master/docs/automation.md)方法二中的方法,该方法可以为多个用户设置不同的since_date,非常灵活。**
**设置write_mode**
write_mode控制结果文件格式,取值范围是csv、txt、json、mongo和mysql,分别代表将结果文件写入csv、txt、json、MongoDB和MySQL数据库。write_mode可以同时包含这些取值中的一个或几个,如: ``` "write_mode": ["csv", "txt"], ``` -代表将结果信息写入csv文件和txt文件。特别注意,如果你想写入数据库,除了在write_mode添加对应数据库的名字外,还应该安装相关数据库和对应python模块,具体操作见[设置数据库](#3设置数据库可选)部分。
+代表将结果信息写入csv文件和txt文件。特别注意,如果你想写入数据库,除了在write_mode添加对应数据库的名字外,还应该安装相关数据库和对应python模块,具体操作见[设置数据库](https://github.com/dataabc/weiboSpider/blob/master/docs/settings.md#设置数据库可选)部分。
**设置pic_download**
pic_download控制是否下载微博中的图片,值为1代表下载,值为0代表不下载,如 ``` @@ -76,7 +76,7 @@ video_download控制是否下载微博中的视频,值为1代表下载,值 ``` 代表下载微博中的视频。
**设置cookie**
-请按照[如何获取cookie](#如何获取cookie),获取cookie,然后将“your cookie”替换成真实的cookie值。
+请按照[如何获取cookie](https://github.com/dataabc/weiboSpider/blob/master/docs/cookie.md),获取cookie,然后将“your cookie”替换成真实的cookie值。
**设置mysql_config(可选)**
mysql_config控制mysql参数配置。如果你不需要将结果信息写入mysql,这个参数可以忽略,即删除或保留都无所谓;如果你需要写入mysql且config.json文件中mysql_config的配置与你的mysql配置不一样,请将该值改成你自己mysql中的参数配置。 From 42f4dafca42bf43073d0d18e91ee33ef4bb00170 Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Tue, 9 Jun 2020 21:45:07 +0800 Subject: [PATCH 189/363] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index a6f3e927..440af520 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ # Weibo Spider -本程序可以连续爬取**一个**或**多个**新浪微博用户(如[胡歌](https://weibo.cn/u/1223178222)、[迪丽热巴](https://weibo.cn/u/1669879400)、[郭碧婷](https://weibo.cn/u/1729370543))的数据,并将结果信息写入**文件**或**数据库**。写入信息几乎包括了用户微博的所有数据,主要有**用户信息**和**微博信息**两大类,前者包含用户昵称、关注数、粉丝数、微博数等等;后者包含微博正文、发布时间、发布工具、评论数等等,因为内容太多,这里不再赘述,详细内容见[输出](#输出)部分。
+本程序可以连续爬取**一个**或**多个**新浪微博用户(如[胡歌](https://weibo.cn/u/1223178222)、[迪丽热巴](https://weibo.cn/u/1669879400)、[郭碧婷](https://weibo.cn/u/1729370543))的数据,并将结果信息写入**文件**或**数据库**。写入信息几乎包括了用户微博的所有数据,主要有**用户信息**和**微博信息**两大类,前者包含用户昵称、关注数、粉丝数、微博数等等;后者包含微博正文、发布时间、发布工具、评论数等等,因为内容太多,这里不再赘述,详细内容见[获取到的字段](#获取到的字段)部分。
具体的写入文件类型如下: - 写入**txt文件**(默认) - 写入**csv文件**(默认) From 4bcf0cdda55882aeccef72ab96b835b60d7453ec Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Wed, 10 Jun 2020 01:05:43 +0800 Subject: [PATCH 190/363] Update CONTRIBUTING.md --- CONTRIBUTING.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 4661f48c..a2c6f347 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,3 +1,6 @@ +# 为本项目做贡献 +本项目使用**Python3**编写,感谢大家对项目的支持,也欢迎大家为开源项目做贡献。鉴于大家拥有不同的技能、经验、认知、时间等,每个人可以根据自身的情况为本项目贡献力量。我们不会因为贡献者写的代码少或者提的建议不好而失去感恩之心,每一个乐于奉献的人都值得并且应该被尊重。所以,如果您觉得自己的代码或建议不好,而不好意思去贡献,这样可能就让本项目失去了一次变得更好的机会。所以,如果您有好的想法、建议,或者发现了bug,欢迎通过issue提出来,这也是一种贡献方式。如果您想要为本项目贡献代码,我们也非常欢迎。最开始您可以通过pull request方式提交代码,如果我们发现您的代码质量非常高,或者非常有想法等,我们会邀请您请成为本项目的协作者([Collaborator](https://help.github.com/cn/github/setting-up-and-managing-your-github-user-account/permission-levels-for-a-user-account-repository#collaborator-access-on-a-repository-owned-by-a-user-account)),这样您就可以直接向本项目提交代码了。在您贡献代码之前,请先阅读下面的说明,这会让您更好的贡献代码。 + ## 贡献代码之前 如果要开发新功能或者其它需要大量编写代码的修改,在开发之前最好发Issue说明一下。比如,“我准备开发xx新功能”或者“我想修改xx功能”之类的。因为要开发的功能不一定适合本项目,所以提前说明讨论,判断新功能或修改是否有必要。否则,费时费力写了很多代码,结果最后没有被采纳,可能会做一些无用功。 ## Python风格规范(建议Python新手阅读) From c4ff63d666737716e56b41d8f87749ec1d281904 Mon Sep 17 00:00:00 2001 From: dataabc Date: Wed, 10 Jun 2020 19:12:56 +0800 Subject: [PATCH 191/363] =?UTF-8?q?perf:=20=E4=BC=98=E5=8C=96=E6=96=87?= =?UTF-8?q?=E4=BB=B6=E5=90=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- weibo_spider/__main__.py | 2 +- weibo_spider/{weiboSpider.py => weibo_spider.py} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename weibo_spider/{weiboSpider.py => weibo_spider.py} (100%) diff --git a/weibo_spider/__main__.py b/weibo_spider/__main__.py index fb8340a6..e5808ef6 100644 --- a/weibo_spider/__main__.py +++ b/weibo_spider/__main__.py @@ -1,5 +1,5 @@ from absl import app -from .weiboSpider import main +from .weibo_spider import main app.run(main) diff 
--git a/weibo_spider/weiboSpider.py b/weibo_spider/weibo_spider.py similarity index 100% rename from weibo_spider/weiboSpider.py rename to weibo_spider/weibo_spider.py From a95f421c4444b5ceafa8d1968683814a4a6b3767 Mon Sep 17 00:00:00 2001 From: Zhengyang Song Date: Tue, 9 Jun 2020 23:30:45 +0800 Subject: [PATCH 192/363] Update requirements.txt The current version of lxml is outdated and needs a update to fix some install error. --- requirements.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/requirements.txt b/requirements.txt index 806d086b..6092b299 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -lxml==4.3.4 -requests==2.22.0 -tqdm==4.32.2 +lxml==4.5.1 +requests==2.23.0 +tqdm==4.46.1 absl-py==0.9.0 \ No newline at end of file From 86efdea05494b74506415b85ec0f8177bf13e43a Mon Sep 17 00:00:00 2001 From: Zhengyang Song Date: Wed, 10 Jun 2020 22:50:30 +0800 Subject: [PATCH 193/363] Code factor for writer module. Related: #160 --- weibo_spider/weibo_spider.py | 4 +-- weibo_spider/writer/csv_writer.py | 14 ++++---- weibo_spider/writer/json_writer.py | 3 +- weibo_spider/writer/mysql_writer.py | 12 +++---- weibo_spider/writer/txt_writer.py | 56 ++++++++++++++++------------- weibo_spider/writer/writer.py | 12 +++++-- 6 files changed, 57 insertions(+), 44 deletions(-) diff --git a/weibo_spider/weibo_spider.py b/weibo_spider/weibo_spider.py index 3b23b7c8..07979621 100644 --- a/weibo_spider/weibo_spider.py +++ b/weibo_spider/weibo_spider.py @@ -159,12 +159,12 @@ def initialize_info(self, user_config): from .writer import CsvWriter self.writers.append( - CsvWriter(self.filter, self._get_filepath("csv"))) + CsvWriter(self._get_filepath("csv"), self.filter)) if "txt" in self.write_mode: from .writer import TxtWriter self.writers.append( - TxtWriter(self.filter, self._get_filepath("txt"))) + TxtWriter(self._get_filepath("txt"), self.filter)) if "json" in self.write_mode: from .writer import JsonWriter diff --git 
a/weibo_spider/writer/csv_writer.py b/weibo_spider/writer/csv_writer.py index 62f4528d..ecb879fa 100644 --- a/weibo_spider/writer/csv_writer.py +++ b/weibo_spider/writer/csv_writer.py @@ -5,12 +5,8 @@ class CsvWriter(Writer): - def __init__(self, filter, file_path): + def __init__(self, file_path, filter): self.file_path = file_path - self.filter = filter - - def write_user(self, user): - self.user = user result_headers = [ "微博id", @@ -25,7 +21,7 @@ def write_user(self, user): "转发数", "评论数", ] - if not self.filter: + if not filter: result_headers.insert(4, "被转发微博原始图片url") result_headers.insert(5, "是否为原创微博") try: @@ -37,6 +33,9 @@ def write_user(self, user): print("Error: ", e) traceback.print_exc() + def write_user(self, user): + self.user = user + def write_weibo(self, weibos): """将爬取的信息写入csv文件""" try: @@ -45,8 +44,7 @@ def write_weibo(self, weibos): newline="") as f: writer = csv.writer(f) writer.writerows(result_data) - print(u"%d条微博写入csv文件完毕,保存路径:" % len(weibos)) - print(self.file_path) + print(u"%d条微博写入csv文件完毕,保存路径:%s" % (len(weibos), self.file_path)) except Exception as e: print("Error: ", e) traceback.print_exc() diff --git a/weibo_spider/writer/json_writer.py b/weibo_spider/writer/json_writer.py index 7b9159e5..3f438798 100644 --- a/weibo_spider/writer/json_writer.py +++ b/weibo_spider/writer/json_writer.py @@ -46,5 +46,4 @@ def write_weibo(self, weibos): data = self._update_json_data(data, weibos) with codecs.open(self.file_path, "w", encoding="utf-8") as f: f.write(json.dumps(data, indent=4, ensure_ascii=False)) - print(u"%d条微博写入json文件完毕,保存路径:" % len(weibos)) - print(self.file_path) + print(u"%d条微博写入json文件完毕,保存路径:%s" % (len(weibos), self.file_path)) diff --git a/weibo_spider/writer/mysql_writer.py b/weibo_spider/writer/mysql_writer.py index d4fb84bc..56bd4909 100644 --- a/weibo_spider/writer/mysql_writer.py +++ b/weibo_spider/writer/mysql_writer.py @@ -9,6 +9,12 @@ class MySqlWriter(Writer): def __init__(self, mysql_config): self.mysql_config = 
mysql_config + # 创建'weibo'数据库 + create_database = """CREATE DATABASE IF NOT EXISTS weibo DEFAULT + CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci""" + self._mysql_create_database(create_database) + self.mysql_config["db"] = "weibo" + def _mysql_create(self, connection, sql): """创建MySQL数据库或表""" try: @@ -33,7 +39,6 @@ def _mysql_create_database(self, sql): def _mysql_create_table(self, sql): """创建MySQL表""" import pymysql - self.mysql_config["db"] = "weibo" connection = pymysql.connect(**self.mysql_config) self._mysql_create(connection, sql) @@ -43,7 +48,6 @@ def _mysql_insert(self, table, data_list): if len(data_list) > 0: keys = ", ".join(data_list[0].keys()) values = ", ".join(["%s"] * len(data_list[0])) - self.mysql_config["db"] = "weibo" connection = pymysql.connect(**self.mysql_config) cursor = connection.cursor() sql = """INSERT INTO {table}({keys}) VALUES ({values}) ON @@ -101,10 +105,6 @@ def write_user(self, user): """将爬取的用户信息写入MySQL数据库""" self.user = user - # 创建'weibo'数据库 - create_database = """CREATE DATABASE IF NOT EXISTS weibo DEFAULT - CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci""" - self._mysql_create_database(create_database) # 创建'user'表 create_table = """ CREATE TABLE IF NOT EXISTS user ( diff --git a/weibo_spider/writer/txt_writer.py b/weibo_spider/writer/txt_writer.py index d05bb101..24dbc6b9 100644 --- a/weibo_spider/writer/txt_writer.py +++ b/weibo_spider/writer/txt_writer.py @@ -5,42 +5,50 @@ class TxtWriter(Writer): - def __init__(self, filter, file_path): - self.filter = filter + def __init__(self, file_path, filter): self.file_path = file_path + self.user_header = u"用户信息" + self.user_desc = [("nickname", "用户昵称"), ("id", "用户id"), + ("weibo_num", "微博数"), ("following", "关注数"), + ("followers", "粉丝数")] + + if filter: + self.weibo_header = u"原创微博内容" + else: + self.weibo_header = u"微博内容" + self.weibo_desc = [("publish_place", "微博位置"), ("publish_time", "发布时间"), + ("up_num", "点赞数"), ("retweet_num", "转发数"), + ("comment_num", "评论数"), 
("publish_tool", "发布工具")] + def write_user(self, user): self.user = user - if self.filter: - result_header = u"\n\n原创微博内容: \n" - else: - result_header = u"\n\n微博内容: \n" - result_header = (u"用户信息\n用户昵称:" + self.user["nickname"] + u"\n用户id: " + - str(self.user["id"]) + u"\n微博数: " + - str(self.user["weibo_num"]) + u"\n关注数: " + - str(self.user["following"]) + u"\n粉丝数: " + - str(self.user["followers"]) + result_header) + user_info = "\n".join( + [v + ":" + str(self.user[k]) for k, v in self.user_desc]) with open(self.file_path, "ab") as f: - f.write(result_header.encode(sys.stdout.encoding)) + f.write((self.user_header + ":\n" + user_info + "\n\n").encode( + sys.stdout.encoding)) + print(u"%s信息写入txt文件完毕,保存路径:%s" % (user["nickname"], self.file_path)) def write_weibo(self, weibo): """将爬取的信息写入txt文件""" + + weibo_header = "" + if self.weibo_header: + weibo_header = self.weibo_header + ":\n" + self.weibo_header = "" + try: temp_result = [] - for i, w in enumerate(weibo): - temp_result.append(w["content"] + "\n" + u"微博位置: " + - w["publish_place"] + "\n" + u"发布时间: " + - w["publish_time"] + "\n" + u"点赞数: " + - str(w["up_num"]) + u" 转发数: " + - str(w["retweet_num"]) + u" 评论数: " + - str(w["comment_num"]) + "\n" + u"发布工具: " + - w["publish_tool"] + "\n\n") - result = "".join(temp_result) + for w in weibo: + temp_result.append(w["content"] + "\n" + "\n".join( + [v + ":" + str(w[k]) for k, v in self.weibo_desc])) + result = "\n\n".join(temp_result) + "\n\n" + with open(self.file_path, "ab") as f: - f.write(result.encode(sys.stdout.encoding)) - print(u"%d条微博写入txt文件完毕,保存路径:" % len(weibo)) - print(self.file_path) + f.write((weibo_header + result).encode(sys.stdout.encoding)) + print(u"%d条微博写入txt文件完毕,保存路径:%s" % (len(weibo), self.file_path)) except Exception as e: print("Error: ", e) traceback.print_exc() diff --git a/weibo_spider/writer/writer.py b/weibo_spider/writer/writer.py index c05846a2..45366510 100644 --- a/weibo_spider/writer/writer.py +++ b/weibo_spider/writer/writer.py @@ -1,9 
+1,17 @@ -class Writer: - def __init__(self, config): +from abc import ABC, abstractmethod + + +class Writer(ABC): + def __init__(self): + """根据需要,初始化结果路径、初始化表头、初始化数据库等""" pass + @abstractmethod def write_weibo(self, weibo): + """给定微博信息,写入对应文本或数据库""" pass + @abstractmethod def write_user(self, user): + """给定用户信息,写入对应文本或数据库""" pass From 78abbe97eec3a5e2eaff9930930cd529ac513b4d Mon Sep 17 00:00:00 2001 From: Zhengyang Song Date: Thu, 11 Jun 2020 00:23:50 +0800 Subject: [PATCH 194/363] Code refactor for downloader module. Related: #160 --- weibo_spider/downloader/downloader.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/weibo_spider/downloader/downloader.py b/weibo_spider/downloader/downloader.py index 56331567..38f8f081 100644 --- a/weibo_spider/downloader/downloader.py +++ b/weibo_spider/downloader/downloader.py @@ -3,12 +3,13 @@ import sys import traceback +from abc import ABC, abstractmethod import requests from requests.adapters import HTTPAdapter from tqdm import tqdm -class Downloader: +class Downloader(ABC): def __init__(self, file_dir): self.file_dir = file_dir @@ -16,6 +17,11 @@ def __init__(self, file_dir): self.describe = u"" self.key = "" + @abstractmethod + def handle_download(self, urls, w): + """下载 urls 里所指向的图片或视频文件,使用 w 里的信息来生成文件名""" + pass + def get_filepath(self): """获取结果文件路径""" try: @@ -44,9 +50,6 @@ def download_one_file(self, url, file_path, weibo_id): print("Error: ", e) traceback.print_exc() - def handle_download(self): - pass - def download_files(self, weibos): """下载文件(图片/视频)""" try: From 08dee30c9a63f638a4cc54657042288ea25c47ee Mon Sep 17 00:00:00 2001 From: Zhengyang Song Date: Thu, 11 Jun 2020 00:52:40 +0800 Subject: [PATCH 195/363] [#160] Code refactor for module parser. 
--- weibo_spider/parser/mblog_picAll_parser.py | 14 ++++++++++++++ weibo_spider/parser/page_parser.py | 5 +++-- 2 files changed, 17 insertions(+), 2 deletions(-) create mode 100644 weibo_spider/parser/mblog_picAll_parser.py diff --git a/weibo_spider/parser/mblog_picAll_parser.py b/weibo_spider/parser/mblog_picAll_parser.py new file mode 100644 index 00000000..d4e7381a --- /dev/null +++ b/weibo_spider/parser/mblog_picAll_parser.py @@ -0,0 +1,14 @@ + + +from .parser import Parser +from .util import handle_html + + +class MblogPicAllParser(Parser): + def __init__(self, cookie, weibo_id): + self.cookie = cookie + self.url = "https://weibo.cn/mblog/picAll/" + weibo_id + "?rl=1" + self.selector = handle_html(self.cookie, self.url) + + def extract_preview_picture_list(self): + return self.selector.xpath("//img/@src") diff --git a/weibo_spider/parser/page_parser.py b/weibo_spider/parser/page_parser.py index 81b7a72f..aaeb551a 100644 --- a/weibo_spider/parser/page_parser.py +++ b/weibo_spider/parser/page_parser.py @@ -9,6 +9,7 @@ from .. 
import datetime_util, printer from .comment_parser import CommentParser from .parser import Parser +from .mblog_picAll_parser import MblogPicAllParser from .util import handle_garbled, handle_html @@ -328,8 +329,8 @@ def extract_picture_urls(self, info, weibo_id): picture_urls = u"无" if first_pic in a_list: if all_pic in a_list: - selector = handle_html(self.cookie, all_pic) - preview_picture_list = selector.xpath("//img/@src") + preview_picture_list = MblogPicAllParser( + self.cookie, weibo_id).extract_preview_picture_list() picture_list = [ p.replace("/thumb180/", "/large/") for p in preview_picture_list From 23a33c73b2d6c9723dda2cc969910ae0f57c3761 Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Thu, 11 Jun 2020 01:56:55 +0800 Subject: [PATCH 196/363] Update issue templates --- .github/ISSUE_TEMPLATE/------.md | 28 ++++++++++++++++++++++++++++ .github/ISSUE_TEMPLATE/----.md | 10 ++++++++++ .github/ISSUE_TEMPLATE/--.md | 10 ++++++++++ .github/ISSUE_TEMPLATE/bug--.md | 30 ++++++++++++++++++++++++++++++ 4 files changed, 78 insertions(+) create mode 100644 .github/ISSUE_TEMPLATE/------.md create mode 100644 .github/ISSUE_TEMPLATE/----.md create mode 100644 .github/ISSUE_TEMPLATE/--.md create mode 100644 .github/ISSUE_TEMPLATE/bug--.md diff --git a/.github/ISSUE_TEMPLATE/------.md b/.github/ISSUE_TEMPLATE/------.md new file mode 100644 index 00000000..779d3fb7 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/------.md @@ -0,0 +1,28 @@ +--- +name: 程序运行出错 +about: 运行出错,需要帮助 +title: '' +labels: failed +assignees: '' + +--- + +为了更好的解决问题,请认真回答下面的问题。等到问题解决,请及时关闭本issue。
+ +- 问:请您指明哪个版本运行出错(github版/PyPi版/全部)?
+答: + +- 问:您使用的是否是最新的程序(是/否)?
+答: + +- 问:爬取任意用户都会运行出错吗(是/否)?
+答: + +- 问:若只有爬特定微博时才出错,能否提供出错微博的weibo_id或url(非必填)?
+答: + +- 问:若您已提供出错微博的weibo_id或url,可忽略此内容,否则能否提供出错账号的**user_id**(非必填)?
+答: + +- 问:如果方便,请您描述出错详情,最好附上错误提示。
+答: diff --git a/.github/ISSUE_TEMPLATE/----.md b/.github/ISSUE_TEMPLATE/----.md new file mode 100644 index 00000000..a1b433f8 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/----.md @@ -0,0 +1,10 @@ +--- +name: 其它问题 +about: 其它想要讨论的问题 +title: '' +labels: '' +assignees: '' + +--- + + diff --git a/.github/ISSUE_TEMPLATE/--.md b/.github/ISSUE_TEMPLATE/--.md new file mode 100644 index 00000000..503d93e9 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/--.md @@ -0,0 +1,10 @@ +--- +name: 其它 +about: 其它 +title: '' +labels: '' +assignees: '' + +--- + + diff --git a/.github/ISSUE_TEMPLATE/bug--.md b/.github/ISSUE_TEMPLATE/bug--.md new file mode 100644 index 00000000..1e8b9ee7 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug--.md @@ -0,0 +1,30 @@ +--- +name: Bug报修 +about: 向程序开发者申报bug +title: '' +labels: bug +assignees: '' + +--- + +感谢您申报bug,为了表示感谢,如果bug确实存在,您将出现在本项目的贡献者列表里;如果您不但发现了bug,还提供了很好的解决方案,我们会邀请您以pull request的方式成为本项目的代码贡献者(Contributor);如果您多次提供很好的pull request,我们将邀请您成为本项目的协作者(Collaborator)。当然,是否提供解决方案都是自愿的。不管是否是真正的bug、是否提供解决方案,我们都感谢您对本项目的帮助。
+
+ + +- 问:请您指明哪个版本出了bug(github版/PyPi版/全部)?
+答: + +- 问:您使用的是否是最新的程序(是/否)?
+答: + +- 问:爬取任意用户都会复现此bug吗(是/否)?
+答: + +- 问:若只有爬特定微博时才出bug,能否提供出错微博的weibo_id或url(非必填)?
+答: + +- 问:若您已提供出错微博的weibo_id或url,可忽略此内容,否则能否提供出错账号的**user_id**(非必填)?
+答: + +- 问:如果方便,请您描述bug详情,如果代码报错,最好附上错误提示。
+答: From ba97352827efc8f909807945239b657519a80b69 Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Thu, 11 Jun 2020 01:58:05 +0800 Subject: [PATCH 197/363] Rename bug--.md to bug-report.md --- .github/ISSUE_TEMPLATE/{bug--.md => bug-report.md} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename .github/ISSUE_TEMPLATE/{bug--.md => bug-report.md} (100%) diff --git a/.github/ISSUE_TEMPLATE/bug--.md b/.github/ISSUE_TEMPLATE/bug-report.md similarity index 100% rename from .github/ISSUE_TEMPLATE/bug--.md rename to .github/ISSUE_TEMPLATE/bug-report.md From a0143bcf17295e4b40bd0665a119dbe1a269c165 Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Thu, 11 Jun 2020 01:59:29 +0800 Subject: [PATCH 198/363] Update and rename --.md to other.md --- .github/ISSUE_TEMPLATE/--.md | 10 ---------- .github/ISSUE_TEMPLATE/other.md | 10 ++++++++++ 2 files changed, 10 insertions(+), 10 deletions(-) delete mode 100644 .github/ISSUE_TEMPLATE/--.md create mode 100644 .github/ISSUE_TEMPLATE/other.md diff --git a/.github/ISSUE_TEMPLATE/--.md b/.github/ISSUE_TEMPLATE/--.md deleted file mode 100644 index 503d93e9..00000000 --- a/.github/ISSUE_TEMPLATE/--.md +++ /dev/null @@ -1,10 +0,0 @@ ---- -name: 其它 -about: 其它 -title: '' -labels: '' -assignees: '' - ---- - - diff --git a/.github/ISSUE_TEMPLATE/other.md b/.github/ISSUE_TEMPLATE/other.md new file mode 100644 index 00000000..c8d4e001 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/other.md @@ -0,0 +1,10 @@ +--- +name: 其它问题 +about: 其它想讨论的问题 +title: '' +labels: '' +assignees: '' + +--- + + From 57a8bd3ad4865dd7562cfea89a1a84465eb3a908 Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Thu, 11 Jun 2020 02:00:15 +0800 Subject: [PATCH 199/363] Rename other.md to others.md --- .github/ISSUE_TEMPLATE/{other.md => others.md} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename .github/ISSUE_TEMPLATE/{other.md => others.md} (100%) diff --git a/.github/ISSUE_TEMPLATE/other.md b/.github/ISSUE_TEMPLATE/others.md similarity index 100% rename from 
.github/ISSUE_TEMPLATE/other.md rename to .github/ISSUE_TEMPLATE/others.md From 478fe1b42ed3335b8d1f0f2ba3bead0a78fa2d7e Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Thu, 11 Jun 2020 02:00:51 +0800 Subject: [PATCH 200/363] Rename others.md to other-question.md --- .github/ISSUE_TEMPLATE/{others.md => other-question.md} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename .github/ISSUE_TEMPLATE/{others.md => other-question.md} (100%) diff --git a/.github/ISSUE_TEMPLATE/others.md b/.github/ISSUE_TEMPLATE/other-question.md similarity index 100% rename from .github/ISSUE_TEMPLATE/others.md rename to .github/ISSUE_TEMPLATE/other-question.md From 9fd78c478b2f18124726f3521930ce60d4267331 Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Thu, 11 Jun 2020 02:03:46 +0800 Subject: [PATCH 201/363] Update and rename ----.md to feature-request.md --- .github/ISSUE_TEMPLATE/----.md | 10 ---------- .github/ISSUE_TEMPLATE/feature-request.md | 15 +++++++++++++++ 2 files changed, 15 insertions(+), 10 deletions(-) delete mode 100644 .github/ISSUE_TEMPLATE/----.md create mode 100644 .github/ISSUE_TEMPLATE/feature-request.md diff --git a/.github/ISSUE_TEMPLATE/----.md b/.github/ISSUE_TEMPLATE/----.md deleted file mode 100644 index a1b433f8..00000000 --- a/.github/ISSUE_TEMPLATE/----.md +++ /dev/null @@ -1,10 +0,0 @@ ---- -name: 其它问题 -about: 其它想要讨论的问题 -title: '' -labels: '' -assignees: '' - ---- - - diff --git a/.github/ISSUE_TEMPLATE/feature-request.md b/.github/ISSUE_TEMPLATE/feature-request.md new file mode 100644 index 00000000..65870218 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature-request.md @@ -0,0 +1,15 @@ +--- +name: 新需求或建议 +about: 建议开发新功能,或虽然没有新需求但对本项目有其它建议 +title: '' +labels: 'feature' +assignees: '' + +--- + +- 问:请说明需要什么新功能。
+答: + +- 问:请说明添加该功能的意义。(非必填)
+答: + From 66eaa57b987bd625c6f155b6a055d1b6d2826ffc Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Thu, 11 Jun 2020 02:04:43 +0800 Subject: [PATCH 202/363] Rename other-question.md to other.md --- .github/ISSUE_TEMPLATE/{other-question.md => other.md} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename .github/ISSUE_TEMPLATE/{other-question.md => other.md} (100%) diff --git a/.github/ISSUE_TEMPLATE/other-question.md b/.github/ISSUE_TEMPLATE/other.md similarity index 100% rename from .github/ISSUE_TEMPLATE/other-question.md rename to .github/ISSUE_TEMPLATE/other.md From a60216caab88700b1b49fcdd0a6826bc3ab46d5b Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Thu, 11 Jun 2020 02:05:47 +0800 Subject: [PATCH 203/363] Rename ------.md to failed.md --- .github/ISSUE_TEMPLATE/{------.md => failed.md} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename .github/ISSUE_TEMPLATE/{------.md => failed.md} (100%) diff --git a/.github/ISSUE_TEMPLATE/------.md b/.github/ISSUE_TEMPLATE/failed.md similarity index 100% rename from .github/ISSUE_TEMPLATE/------.md rename to .github/ISSUE_TEMPLATE/failed.md From 0f51c22df511f195ef58a2e98102578d6b4d07e8 Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Thu, 11 Jun 2020 11:56:27 +0800 Subject: [PATCH 204/363] Update README.md --- README.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 440af520..53be2f38 100644 --- a/README.md +++ b/README.md @@ -13,14 +13,14 @@ - 下载用户**原创**微博**Live Photo**中的**视频**([免cookie版](https://github.com/dataabc/weibo-crawler)特有) - 下载用户**转发**微博**Live Photo**中的**视频**([免cookie版](https://github.com/dataabc/weibo-crawler)特有)
-当然,如果你只对用户信息感兴趣,而不需要爬用户的微博,也可以通过设置实现只爬取微博用户信息的功能。
-程序也可以实现**爬取结果自动更新**,即:现在爬取了目标用户的微博,几天之后,目标用户可能又发新微博了。通过设置,可以实现每隔几天**增量爬取**用户这几天发的新微博。具体方法见[定期自动爬取微博](#定期自动爬取微博可选)。
-本程序需要设置用户cookie,以获取微博访问权限,后面会讲解如何获取cookie。如需[免cookie版](https://github.com/dataabc/weibo-crawler),大家可以访问,二者功能类似,免cookie版获取的信息更多,用法更简单,而且不需要cookie。
+如果你只对用户信息感兴趣,而不需要爬用户的微博,也可以通过设置实现只爬取微博用户信息的功能。
+本程序需要设置用户cookie,以获取微博访问权限,后面会讲解如何获取cookie。如需[免cookie版](https://github.com/dataabc/weibo-crawler),可以访问,二者功能类似。
* [获取到的字段](#获取到的字段) * [实例](#实例) * [运行环境](#运行环境) * [使用说明](#使用说明) +* [个性化定制程序(可选)](#个性化定制程序可选) * [定期自动爬取微博(可选)](#定期自动爬取微博可选) * [如何获取cookie](#如何获取cookie) * [如何获取user_id](#如何获取user_id) @@ -111,9 +111,9 @@ $ python3 -m weibo_spider ```bash $ python3 -m weibo_spider --config_path="config.json" ``` -### 4.按需求修改脚本(可选) -本部分为可选部分,如果你不需要自己修改代码或添加新功能,可以忽略此部分。
-本程序所有代码都位于weiboSpider.py文件,程序主体是一个Weibo类,上述所有功能都是通过在main函数调用Weibo类实现的,默认的调用代码如下: +## 个性化定制程序(可选) +本部分为可选部分,如果不需要个性化定制程序或添加新功能,可以忽略此部分。
+本程序主体代码位于weibo_spider.py文件,程序主体是一个Weibo类,上述所有功能都是通过在main函数调用Weibo类实现的,默认的调用代码如下: ```python config = get_config() wb = Weibo(config) @@ -174,4 +174,4 @@ $ python3 -m weibo_spider --config_path="config.json" ## 注意事项 1.user_id不能为爬虫微博的user_id。因为要爬微博信息,必须先登录到某个微博账号,此账号我们姑且称为爬虫微博。爬虫微博访问自己的页面和访问其他用户的页面,得到的网页格式不同,所以无法爬取自己的微博信息;如果想要爬取爬虫微博内容,可以参考[获取自身微博信息](https://github.com/dataabc/weiboSpider/issues/113);
-2.cookie有期限限制,超过有效期需重新更新cookie。 +2.cookie有期限限制,若提示cookie错误或已过期,需要重新更新cookie。 From 6dd7cfb816f4a9194ff0773df46d7baa5d9ff664 Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Thu, 11 Jun 2020 18:28:16 +0800 Subject: [PATCH 205/363] Update README.md --- README.md | 141 +++++++++++++++++++++++++++++++----------------------- 1 file changed, 80 insertions(+), 61 deletions(-) diff --git a/README.md b/README.md index 53be2f38..eefba38d 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,7 @@ # Weibo Spider -本程序可以连续爬取**一个**或**多个**新浪微博用户(如[胡歌](https://weibo.cn/u/1223178222)、[迪丽热巴](https://weibo.cn/u/1669879400)、[郭碧婷](https://weibo.cn/u/1729370543))的数据,并将结果信息写入**文件**或**数据库**。写入信息几乎包括了用户微博的所有数据,主要有**用户信息**和**微博信息**两大类,前者包含用户昵称、关注数、粉丝数、微博数等等;后者包含微博正文、发布时间、发布工具、评论数等等,因为内容太多,这里不再赘述,详细内容见[获取到的字段](#获取到的字段)部分。
+ +本程序可以连续爬取**一个**或**多个**新浪微博用户(如[胡歌](https://weibo.cn/u/1223178222)、[迪丽热巴](https://weibo.cn/u/1669879400)、[郭碧婷](https://weibo.cn/u/1729370543))的数据,并将结果信息写入**文件**或**数据库**。写入信息几乎包括用户微博的所有数据,包括**用户信息**和**微博信息**两大类。因为内容太多,这里不再赘述,详细内容见[获取到的字段](#获取到的字段)。如果只需要用户信息,可以通过设置实现只爬取微博用户信息的功能。本程序需设置cookie来获取微博访问权限,后面会讲解[如何获取cookie](#如何获取cookie)。如果不想设置cookie,可以使用[免cookie版](https://github.com/dataabc/weibo-crawler),二者功能类似。 + 具体的写入文件类型如下: - 写入**txt文件**(默认) - 写入**csv文件**(默认) @@ -11,24 +13,25 @@ - 下载用户**原创**微博中的**视频**(可选) - 下载用户**转发**微博中的**视频**([免cookie版](https://github.com/dataabc/weibo-crawler)特有) - 下载用户**原创**微博**Live Photo**中的**视频**([免cookie版](https://github.com/dataabc/weibo-crawler)特有) -- 下载用户**转发**微博**Live Photo**中的**视频**([免cookie版](https://github.com/dataabc/weibo-crawler)特有)
- -如果你只对用户信息感兴趣,而不需要爬用户的微博,也可以通过设置实现只爬取微博用户信息的功能。
-本程序需要设置用户cookie,以获取微博访问权限,后面会讲解如何获取cookie。如需[免cookie版](https://github.com/dataabc/weibo-crawler),可以访问,二者功能类似。
- -* [获取到的字段](#获取到的字段) -* [实例](#实例) -* [运行环境](#运行环境) -* [使用说明](#使用说明) -* [个性化定制程序(可选)](#个性化定制程序可选) -* [定期自动爬取微博(可选)](#定期自动爬取微博可选) -* [如何获取cookie](#如何获取cookie) -* [如何获取user_id](#如何获取user_id) -* [相关项目](#相关项目) -* [注意事项](#注意事项) +- 下载用户**转发**微博**Live Photo**中的**视频**([免cookie版](https://github.com/dataabc/weibo-crawler)特有) + +## 内容列表 + +- [获取到的字段](#获取到的字段) +- [示例](#示例) +- [运行环境](#运行环境) +- [使用说明](#使用说明) +- [个性化定制程序(可选)](#个性化定制程序可选) +- [定期自动爬取微博(可选)](#定期自动爬取微博可选) +- [如何获取cookie](#如何获取cookie) +- [如何获取user_id](#如何获取user_id) +- [相关项目](#相关项目) +- [注意事项](#注意事项) ## 获取到的字段 -本部分为爬取到的字段信息说明,为了与[免cookie版](https://github.com/dataabc/weibo-crawler)区分,下面将两者爬取到的信息都列出来。如果是免cookie版所特有的信息,会有免cookie标注,没有标注的为二者共有的信息。
+ +本部分为爬取到的字段信息说明,为了与[免cookie版](https://github.com/dataabc/weibo-crawler)区分,下面将两者爬取到的信息都列出来。如果是免cookie版所特有的信息,会有免cookie标注,没有标注的为二者共有的信息。 + **用户信息** - 用户id:微博用户id,如"1669879400",其实这个字段本来就是已知字段 - 昵称:用户昵称,如"Dear-迪丽热巴" @@ -71,39 +74,47 @@ - 话题(免cookie版):微博话题,即两个#中的内容,若存在多个话题,每个url以英文逗号分隔,若没有则值为'' - @用户(免cookie版):微博@的用户,若存在多个@用户,每个url以英文逗号分隔,若没有则值为'' - 原始微博(免cookie版):为转发微博所特有,是转发微博中那条被转发的微博,存储为字典形式,包含了上述微博信息中的所有内容,如微博id、微博内容等等 -
-## 实例 -如果想要知道程序的具体运行结果,可以查看[实例文档](https://github.com/dataabc/weiboSpider/blob/master/docs/example.md),该文档介绍了爬取[迪丽热巴微博](https://weibo.cn/u/1669879400)的例子,并附有部分结果文件截图。 +## 示例 + +如果想要知道程序的具体运行结果,可以查看[示例文档](https://github.com/dataabc/weiboSpider/blob/master/docs/example.md),该文档介绍了爬取[迪丽热巴微博](https://weibo.cn/u/1669879400)的例子,并附有部分结果文件截图。 ## 运行环境 + - 开发语言:python2/python3 - 系统: Windows/Linux/macOS ## 使用说明 + ### 0.版本 -本程序有两个版本,你现在看到的是python3版,另一个是python2版,python2版位于[python2分支](https://github.com/dataabc/weiboSpider/tree/python2)。目前主力开发python3版,包括新功能开发和bug修复;python2版仅支持bug修复。推荐python3用户使用当前版本,推荐python2用户使用[python2版](https://github.com/dataabc/weiboSpider/tree/python2),本使用说明是python3版的使用说明。
-### 1.下载脚本 -本程序提供两种下载方式,一种是**源码下载安装**,另一种是**pip安装**,二者功能完全相同。如果你需要修改源码,建议使用第一种方式,否则选哪种安装方式都可以。
-#### 源码下载安装 -下载脚本 -```bash -$ git clone https://github.com/dataabc/weibospider.git -``` -安装依赖 + +本程序有两个版本,你现在看到的是python3版,另一个是python2版,python2版位于[python2分支](https://github.com/dataabc/weiboSpider/tree/python2)。目前主力开发python3版,包括新功能开发和bug修复;python2版仅支持bug修复。推荐python3用户使用当前版本,推荐python2用户使用[python2版](https://github.com/dataabc/weiboSpider/tree/python2),本使用说明是python3版的使用说明。 + +### 1.安装程序 + +本程序提供两种安装方式,一种是**源码安装**,另一种是**pip安装**,二者功能完全相同。如果你需要修改源码,建议使用第一种方式,否则选哪种安装方式都可以。 + +#### 源码安装 + ```bash +$ git clone https://github.com/dataabc/weiboSpider.git +$ cd weiboSpider $ pip install -r requirements.txt ``` -运行上述命令,将本项目下载到当前目录,如果下载成功当前目录会出现一个名为"weibospider"的文件夹;
+ #### pip安装 + ```bash $ python3 -m pip install weibo-spider ``` + ### 2.程序设置 + 要了解程序设置,请查看[程序设置文档](https://github.com/dataabc/weiboSpider/blob/master/docs/settings.md)。 -### 3.运行脚本 -**源码下载安装**的用户可以在weiboSpider目录运行如下命令,**pip安装**的用户可以在任意有写权限的目录运行如下命令 +### 3.运行程序 + +**源码安装**的用户可以在weiboSpider目录运行如下命令,**pip安装**的用户可以在任意有写权限的目录运行如下命令 ```bash $ python3 -m weibo_spider ``` @@ -111,32 +122,35 @@ $ python3 -m weibo_spider ```bash $ python3 -m weibo_spider --config_path="config.json" ``` + ## 个性化定制程序(可选) -本部分为可选部分,如果不需要个性化定制程序或添加新功能,可以忽略此部分。
+ +本部分为可选部分,如果不需要个性化定制程序或添加新功能,可以忽略此部分。 + 本程序主体代码位于weibo_spider.py文件,程序主体是一个Weibo类,上述所有功能都是通过在main函数调用Weibo类实现的,默认的调用代码如下: ```python config = get_config() wb = Weibo(config) wb.start() # 爬取微博信息 ``` -用户可以按照自己的需求调用或修改Weibo类。
-通过执行本程序,我们可以得到很多信息
+用户可以按照自己的需求调用或修改Weibo类。通过执行本程序,我们可以得到很多信息。 +
点击查看详情 -**wb.user['nickname']**:用户昵称;
-**wb.user['gender']**:用户性别;
-**wb.user['location']**:用户所在地;
-**wb.user['birthday']**:用户出生日期;
-**wb.user['description']**:用户简介;
-**wb.user['verified_reason']**:用户认证;
-**wb.user['talent']**:用户标签;
-**wb.user['education']**:用户学习经历;
-**wb.user['work']**:用户工作经历;
-**wb.user['weibo_num']**:微博数;
-**wb.user['following']**:关注数;
-**wb.user['followers']**:粉丝数;
+- wb.user['nickname']:用户昵称; +- wb.user['gender']:用户性别; +- wb.user['location']:用户所在地; +- wb.user['birthday']:用户出生日期; +- wb.user['description']:用户简介; +- wb.user['verified_reason']:用户认证; +- wb.user['talent']:用户标签; +- wb.user['education']:用户学习经历; +- wb.user['work']:用户工作经历; +- wb.user['weibo_num']:微博数; +- wb.user['following']:关注数; +- wb.user['followers']:粉丝数;
**wb.weibo**:除不包含上述信息外,wb.weibo包含爬取到的所有微博信息,如**微博id**、**微博正文**、**原始图片url**、**发布位置**、**发布时间**、**发布工具**、**点赞数**、**转发数**、**评论数**等。如果爬的是全部微博(原创+转发),除上述信息之外,还包含被**转发微博原始图片url**、**是否为原创微博**等。wb.weibo是一个列表,包含了爬取的所有微博信息。wb.weibo[0]为爬取的第一条微博,wb.weibo[1]为爬取的第二条微博,以此类推。当filter=1时,wb.weibo[0]为爬取的第一条**原创**微博,以此类推。wb.weibo[0]['id']为第一条微博的id,wb.weibo[0]['content']为第一条微博的正文,wb.weibo[0]['publish_time']为第一条微博的发布时间,还有其它很多信息不在赘述,大家可以点击下面的"详情"查看具体用法。 @@ -144,34 +158,39 @@ $ python3 -m weibo_spider --config_path="config.json" 详情 -若目标微博用户存在微博,则:
-**id**:存储微博id。如wb.weibo[0]['id']为最新一条微博的id;
-**content**:存储微博正文。如wb.weibo[0]['content']为最新一条微博的正文;
-**article_url**:存储微博中头条文章的url。如wb.weibo[0]['article_url']为最新一条微博的头条文章url,若微博中不存在头条文章,则值为'';
-**original_pictures**:存储原创微博的原始图片url和转发微博转发理由中的图片url。如wb.weibo[0]['original_pictures']为最新一条微博的原始图片url,若该条微博有多张图片,则存储多个url,以英文逗号分割;若该微博没有图片,则值为"无";
-**retweet_pictures**:存储被转发微博中的原始图片url。当最新微博为原创微博或者为没有图片的转发微博时,则值为"无",否则为被转发微博的图片url。若有多张图片,则存储多个url,以英文逗号分割;
-**publish_place**:存储微博的发布位置。如wb.weibo[0]['publish_place']为最新一条微博的发布位置,如果该条微博没有位置信息,则值为"无";
-**publish_time**:存储微博的发布时间。如wb.weibo[0]['publish_time']为最新一条微博的发布时间;
-**up_num**:存储微博获得的点赞数。如wb.weibo[0]['up_num']为最新一条微博获得的点赞数;
-**retweet_num**:存储微博获得的转发数。如wb.weibo[0]['retweet_num']为最新一条微博获得的转发数;
-**comment_num**:存储微博获得的评论数。如wb.weibo[0]['comment_num']为最新一条微博获得的评论数;
-**publish_tool**:存储微博的发布工具。如wb.weibo[0]['publish_tool']为最新一条微博的发布工具。 +若目标微博用户存在微博,则: +- id:存储微博id。如wb.weibo[0]['id']为最新一条微博的id; +- content:存储微博正文。如wb.weibo[0]['content']为最新一条微博的正文; +- article_url:存储微博中头条文章的url。如wb.weibo[0]['article_url']为最新一条微博的头条文章url,若微博中不存在头条文章,则值为''; +- original_pictures:存储原创微博的原始图片url和转发微博转发理由中的图片url。如wb.weibo[0]['original_pictures']为最新一条微博的原始图片url,若该条微博有多张图片,则存储多个url,以英文逗号分割;若该微博没有图片,则值为"无"; +- retweet_pictures:存储被转发微博中的原始图片url。当最新微博为原创微博或者为没有图片的转发微博时,则值为"无",否则为被转发微博的图片url。若有多张图片,则存储多个url,以英文逗号分割; +- publish_place:存储微博的发布位置。如wb.weibo[0]['publish_place']为最新一条微博的发布位置,如果该条微博没有位置信息,则值为"无"; +- publish_time:存储微博的发布时间。如wb.weibo[0]['publish_time']为最新一条微博的发布时间; +- up_num:存储微博获得的点赞数。如wb.weibo[0]['up_num']为最新一条微博获得的点赞数; +- retweet_num:存储微博获得的转发数。如wb.weibo[0]['retweet_num']为最新一条微博获得的转发数; +- comment_num:存储微博获得的评论数。如wb.weibo[0]['comment_num']为最新一条微博获得的评论数; +- publish_tool:存储微博的发布工具。如wb.weibo[0]['publish_tool']为最新一条微博的发布工具。 ## 定期自动爬取微博(可选) + 要想让程序每个一段时间自动爬取,且爬取的内容为新增加的内容(不包括已经获取的微博),请查看[定期自动爬取微博](https://github.com/dataabc/weiboSpider/blob/master/docs/automation.md)。 ## 如何获取cookie + 要了解获取cookie方法,请查看[cookie文档](https://github.com/dataabc/weiboSpider/blob/master/docs/cookie.md)。 ## 如何获取user_id + 要了解获取user_id方法,请查看[user_id文档](https://github.com/dataabc/weiboSpider/blob/master/docs/userid.md),该文档介绍了如何获取一个及多个微博用户user_id的方法。 ## 相关项目 + - [weibo-crawler](https://github.com/dataabc/weibo-crawler) - 功能和本项目完全一样,可以不添加cookie,获取的微博属性更多; - [weibo-search](https://github.com/dataabc/weibo-search) - 可以连续获取一个或多个**微博关键词搜索**结果,并将结果写入文件(可选)、数据库(可选)等。所谓微博关键词搜索即:**搜索正文中包含指定关键词的微博**,可以指定搜索的时间范围。对于非常热门的关键词,一天的时间范围,可以获得**1000万**以上的搜索结果,N天的时间范围就可以获得1000万 X N搜索结果。对于大多数关键词,一天产生的相应微博数量应该在1000万条以下,因此可以说该程序可以获得大部分关键词的全部或近似全部的搜索结果。而且该程序可以获得搜索结果的所有信息,本程序获得的微博信息该程序都能获得。 ## 注意事项 -1.user_id不能为爬虫微博的user_id。因为要爬微博信息,必须先登录到某个微博账号,此账号我们姑且称为爬虫微博。爬虫微博访问自己的页面和访问其他用户的页面,得到的网页格式不同,所以无法爬取自己的微博信息;如果想要爬取爬虫微博内容,可以参考[获取自身微博信息](https://github.com/dataabc/weiboSpider/issues/113);
-2.cookie有期限限制,若提示cookie错误或已过期,需要重新更新cookie。 + +1.user_id不能为爬虫微博的user_id。因为要爬微博信息,必须先登录到某个微博账号,此账号我们姑且称为爬虫微博。爬虫微博访问自己的页面和访问其他用户的页面,得到的网页格式不同,所以无法爬取自己的微博信息;如果想要爬取爬虫微博内容,可以参考[获取自身微博信息](https://github.com/dataabc/weiboSpider/issues/113); +2.cookie有期限限制,大约三个月。若提示cookie错误或已过期,需要重新更新cookie。 From 01382bc4f81b987353f17ff69108c6b1286bd840 Mon Sep 17 00:00:00 2001 From: Zhengyang Song Date: Thu, 11 Jun 2020 20:09:32 +0800 Subject: [PATCH 206/363] [#160] minor refactor: remove some code in get_user_info. --- weibo_spider/weibo_spider.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/weibo_spider/weibo_spider.py b/weibo_spider/weibo_spider.py index 07979621..f122fdb7 100644 --- a/weibo_spider/weibo_spider.py +++ b/weibo_spider/weibo_spider.py @@ -79,10 +79,7 @@ def write_user(self, user): def get_user_info(self, user_uri): # 获取用户信息、微博数、关注数、粉丝数 - self.user = {} - user = IndexParser(self.cookie, user_uri).get_user() - for k, v in user.items(): - self.user[k] = v + self.user = IndexParser(self.cookie, user_uri).get_user() def get_weibo_info(self): """获取微博信息""" From e3b8d7a39341eb3df711a40522399eeaf1ec3c7d Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Sun, 14 Jun 2020 19:12:54 +0800 Subject: [PATCH 207/363] Create stale.yml --- .github/stale.yml | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 .github/stale.yml diff --git a/.github/stale.yml b/.github/stale.yml new file mode 100644 index 00000000..cbbef7f8 --- /dev/null +++ b/.github/stale.yml @@ -0,0 +1,20 @@ +# Number of days of inactivity before an issue becomes stale +daysUntilStale: 60 +# Number of days of inactivity before a stale issue is closed +daysUntilClose: 7 +# Issues with these labels will never be considered stale +exemptLabels: + - pinned + - security + - to do +# Label to use when marking an issue as stale +staleLabel: wontfix +# Comment to post when marking an issue as stale. 
Set to `false` to disable +markComment: > + This issue has been automatically marked as stale because it has not had + recent activity. It will be closed if no further activity occurs. Thank you + for your contributions. +# Comment to post when closing a stale issue. Set to `false` to disable +closeComment: > + Closing as stale, please reopen if you'd like to work on this further. +only: issues From 1dc3e3a96f95ca18822db8c32ab04c4151433ca8 Mon Sep 17 00:00:00 2001 From: Zhengyang Song Date: Sun, 14 Jun 2020 22:09:32 +0800 Subject: [PATCH 208/363] refactor [#160]: make class User to group user related info. --- weibo_spider/parser/index_parser.py | 20 ++++++++----------- weibo_spider/parser/info_parser.py | 30 +++++++++++++---------------- weibo_spider/printer.py | 9 --------- weibo_spider/user.py | 29 ++++++++++++++++++++++++++++ weibo_spider/weibo_spider.py | 20 +++++++++---------- weibo_spider/writer/json_writer.py | 2 +- weibo_spider/writer/mongo_writer.py | 6 +++--- weibo_spider/writer/mysql_writer.py | 6 +++--- weibo_spider/writer/txt_writer.py | 4 ++-- 9 files changed, 69 insertions(+), 57 deletions(-) create mode 100644 weibo_spider/user.py diff --git a/weibo_spider/parser/index_parser.py b/weibo_spider/parser/index_parser.py index 88a10e9d..db870053 100644 --- a/weibo_spider/parser/index_parser.py +++ b/weibo_spider/parser/index_parser.py @@ -28,19 +28,15 @@ def _get_user_id(self): def get_user(self): """获取用户信息、微博数、关注数、粉丝数""" try: - self.user = {} - self.user["id"] = self._get_user_id() - user = InfoParser(self.cookie, - self.user["id"]).extract_user_info() # 获取用户信息 - for k, v in user.items(): - self.user[k] = v + user_id = self._get_user_id() + self.user = InfoParser(self.cookie, + user_id).extract_user_info() # 获取用户信息 + self.user.id = user_id + user_info = self.selector.xpath("//div[@class='tip2']/*/text()") - weibo_num = int(user_info[0][3:-1]) - following = int(user_info[1][3:-1]) - followers = int(user_info[2][3:-1]) - self.user["weibo_num"] = 
weibo_num - self.user["following"] = following - self.user["followers"] = followers + self.user.weibo_num = int(user_info[0][3:-1]) + self.user.following = int(user_info[1][3:-1]) + self.user.followers = int(user_info[2][3:-1]) return self.user except Exception as e: print("Error: ", e) diff --git a/weibo_spider/parser/info_parser.py b/weibo_spider/parser/info_parser.py index cd52e941..c0330bff 100644 --- a/weibo_spider/parser/info_parser.py +++ b/weibo_spider/parser/info_parser.py @@ -4,6 +4,8 @@ from .parser import Parser from .util import handle_html +from ..user import User + class InfoParser(Parser): def __init__(self, cookie, user_id): @@ -14,43 +16,37 @@ def __init__(self, cookie, user_id): def extract_user_info(self): """提取用户信息""" try: - user = {} + user = User() nickname = self.selector.xpath("//title/text()")[0] nickname = nickname[:-3] if nickname == u"登录 - 新" or nickname == u"新浪": sys.exit(u"cookie错误或已过期,请按照README中方法重新获取") - user["nickname"] = nickname + user.nickname = nickname + basic_info = self.selector.xpath("//div[@class='c'][3]/text()") zh_list = [u"性别", u"地区", u"生日", u"简介", u"认证", u"达人"] en_list = [ - "gender", - "location", - "birthday", - "description", - "verified_reason", - "talent", - "education", - "work", + "gender", "location", "birthday", "description", + "verified_reason", "talent" ] - for i in en_list: - user[i] = "" for i in basic_info: if i.split(":", 1)[0] in zh_list: - user[en_list[zh_list.index(i.split(":", 1)[0])]] = i.split( - ":", 1)[1].replace("\u3000", "") + setattr(user, en_list[zh_list.index(i.split(":", 1)[0])], + i.split(":", 1)[1].replace("\u3000", "")) + if self.selector.xpath( "//div[@class='tip'][2]/text()")[0] == u"学习经历": - user["education"] = self.selector.xpath( + user.education = self.selector.xpath( "//div[@class='c'][4]/text()")[0][1:].replace( u"\xa0", u" ") if self.selector.xpath( "//div[@class='tip'][3]/text()")[0] == u"工作经历": - user["work"] = self.selector.xpath( + user.work = self.selector.xpath( 
"//div[@class='c'][5]/text()")[0][1:].replace( u"\xa0", u" ") elif self.selector.xpath( "//div[@class='tip'][2]/text()")[0] == u"工作经历": - user["work"] = self.selector.xpath( + user.work = self.selector.xpath( "//div[@class='c'][4]/text()")[0][1:].replace( u"\xa0", u" ") return user diff --git a/weibo_spider/printer.py b/weibo_spider/printer.py index b1fa15d6..7d2bb49e 100644 --- a/weibo_spider/printer.py +++ b/weibo_spider/printer.py @@ -12,12 +12,3 @@ def print_one_weibo(weibo): print(u"评论数:%d" % weibo["comment_num"]) print(u"url:https://weibo.cn/comment/%s" % weibo["id"]) print("-" * 100) - - -def print_user_info(user): - """打印微博用户信息""" - print(u"用户昵称: %s" % user["nickname"]) - print(u"用户id: %s" % user["id"]) - print(u"微博数: %d" % user["weibo_num"]) - print(u"关注数: %d" % user["following"]) - print(u"粉丝数: %d" % user["followers"]) diff --git a/weibo_spider/user.py b/weibo_spider/user.py new file mode 100644 index 00000000..29e47f3d --- /dev/null +++ b/weibo_spider/user.py @@ -0,0 +1,29 @@ +class User: + def __init__(self): + self.id = '' + + self.nickname = '' + + self.gender = '' + self.location = '' + self.birthday = '' + self.description = '' + self.verified_reason = '' + self.talent = '' + + self.education = '' + self.work = '' + + self.weibo_num = 0 + self.following = 0 + self.followers = 0 + + def __str__(self): + """打印微博用户信息""" + result = "" + result += u"用户昵称: %s\n" % self.nickname + result += u"用户id: %s\n" % self.id + result += u"微博数: %d\n" % self.weibo_num + result += u"关注数: %d\n" % self.following + result += u"粉丝数: %d\n" % self.followers + return result diff --git a/weibo_spider/weibo_spider.py b/weibo_spider/weibo_spider.py index f122fdb7..d944d4a4 100644 --- a/weibo_spider/weibo_spider.py +++ b/weibo_spider/weibo_spider.py @@ -13,8 +13,9 @@ from absl import app, flags from tqdm import tqdm -from . import config_util, datetime_util, printer +from . 
import config_util, datetime_util from .parser import IndexParser, PageParser +from .user import User FLAGS = flags.FLAGS @@ -40,6 +41,7 @@ def __init__(self, config): "video_download"] # 取值范围为0、1,程序默认为0,代表不下载微博视频,1代表下载 self.cookie = {"Cookie": config["cookie"]} self.mysql_config = config.get("mysql_config") # MySQL数据库连接配置,可以不填 + user_id_list = config["user_id_list"] if not isinstance(user_id_list, list): if FLAGS.user_id_list is not None: @@ -60,9 +62,8 @@ def __init__(self, config): self.user_config_list = user_config_list # 要爬取的微博用户的user_config列表 self.user_config = {} # 用户配置,包含用户id和since_date self.start_time = "" # 获取用户第一条微博时的时间 - self.user = {} # 存储爬取到的用户信息 + self.user = User() # 存储爬取到的用户信息 self.got_num = 0 # 存储爬取到的微博数 - self.weibo = [] # 存储爬取到的所有微博信息 self.weibo_id_list = [] # 存储爬取到的所有微博id def write_weibo(self, weibos): @@ -103,8 +104,8 @@ def get_weibo_info(self): self.weibo_id_list) # 获取第page页的全部微博 print(u"{}已获取{}({})的第{}页微博{}".format( "-" * 30, - self.user["nickname"], - self.user["id"], + self.user.nickname, + self.user.id, page, "-" * 30, )) @@ -131,14 +132,14 @@ def _get_filepath(self, type): file_dir = FLAGS.output_dir else: file_dir = (os.getcwd() + os.sep + "weibo" + os.sep + - self.user["nickname"]) + self.user.nickname) if type == "img" or type == "video": file_dir = file_dir + os.sep + type if not os.path.isdir(file_dir): os.makedirs(file_dir) if type == "img" or type == "video": return file_dir - file_path = file_dir + os.sep + self.user["id"] + "." + type + file_path = file_dir + os.sep + self.user.id + "." 
+ type return file_path except Exception as e: print("Error: ", e) @@ -147,7 +148,6 @@ def _get_filepath(self, type): def initialize_info(self, user_config): """初始化爬虫信息""" self.got_num = 0 - self.weibo = [] self.user_config = user_config self.weibo_id_list = [] @@ -191,7 +191,7 @@ def start(self): try: for user_config in self.user_config_list: self.get_user_info(user_config["user_uri"]) - printer.print_user_info(self.user) + print(self.user) print("*" * 100) self.initialize_info(user_config) @@ -212,7 +212,7 @@ def start(self): config_util.update_user_config_file( self.user_config_file_path, self.user_config["user_uri"], - self.user["nickname"], + self.user.nickname, self.start_time, ) except Exception as e: diff --git a/weibo_spider/writer/json_writer.py b/weibo_spider/writer/json_writer.py index 3f438798..e55d72c7 100644 --- a/weibo_spider/writer/json_writer.py +++ b/weibo_spider/writer/json_writer.py @@ -14,7 +14,7 @@ def write_user(self, user): def _update_json_data(self, data, weibo_info): """更新要写入json结果文件中的数据,已经存在于json中的信息更新为最新值,不存在的信息添加到data中""" - data["user"] = self.user + data["user"] = self.user.__dict__ if data.get("weibo"): is_new = 1 # 待写入微博是否全部为新微博,即待写入微博与json中的数据不重复 for old in data["weibo"]: diff --git a/weibo_spider/writer/mongo_writer.py b/weibo_spider/writer/mongo_writer.py index ded49366..f7350cbe 100644 --- a/weibo_spider/writer/mongo_writer.py +++ b/weibo_spider/writer/mongo_writer.py @@ -33,7 +33,7 @@ def write_weibo(self, weibos): """将爬取的微博信息写入MongoDB数据库""" weibo_list = [] for w in weibos: - w["user_id"] = self.user["id"] + w["user_id"] = self.user.id weibo_list.append(w) self._info_to_mongodb("weibo", weibo_list) print(u"%d条微博写入MongoDB数据库完毕" % len(weibos)) @@ -41,6 +41,6 @@ def write_weibo(self, weibos): def write_user(self, user): """将爬取的用户信息写入MongoDB数据库""" self.user = user - user_list = [user] + user_list = [user.__dict__] self._info_to_mongodb("user", user_list) - print(u"%s信息写入MongoDB数据库完毕" % user["nickname"]) + 
print(u"%s信息写入MongoDB数据库完毕" % user.nickname) diff --git a/weibo_spider/writer/mysql_writer.py b/weibo_spider/writer/mysql_writer.py index 56bd4909..580d3f1e 100644 --- a/weibo_spider/writer/mysql_writer.py +++ b/weibo_spider/writer/mysql_writer.py @@ -96,7 +96,7 @@ def write_weibo(self, weibos): weibo_list = [] info_list = copy.deepcopy(weibos) for weibo in info_list: - weibo["user_id"] = self.user["id"] + weibo["user_id"] = self.user.id weibo_list.append(weibo) self._mysql_insert("weibo", weibo_list) print(u"%d条微博写入MySQL数据库完毕" % len(weibos)) @@ -124,5 +124,5 @@ def write_user(self, user): PRIMARY KEY (id) ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4""" self._mysql_create_table(create_table) - self._mysql_insert("user", [user]) - print(u"%s信息写入MySQL数据库完毕" % user["nickname"]) + self._mysql_insert("user", [user.__dict__]) + print(u"%s信息写入MySQL数据库完毕" % user.nickname) diff --git a/weibo_spider/writer/txt_writer.py b/weibo_spider/writer/txt_writer.py index 24dbc6b9..bba166c3 100644 --- a/weibo_spider/writer/txt_writer.py +++ b/weibo_spider/writer/txt_writer.py @@ -24,12 +24,12 @@ def __init__(self, file_path, filter): def write_user(self, user): self.user = user user_info = "\n".join( - [v + ":" + str(self.user[k]) for k, v in self.user_desc]) + [v + ":" + str(self.user.__dict__[k]) for k, v in self.user_desc]) with open(self.file_path, "ab") as f: f.write((self.user_header + ":\n" + user_info + "\n\n").encode( sys.stdout.encoding)) - print(u"%s信息写入txt文件完毕,保存路径:%s" % (user["nickname"], self.file_path)) + print(u"%s信息写入txt文件完毕,保存路径:%s" % (self.user.nickname, self.file_path)) def write_weibo(self, weibo): """将爬取的信息写入txt文件""" From 0775d33249efa5abbb9f62ca9853f8debd371269 Mon Sep 17 00:00:00 2001 From: Zhengyang Song Date: Sun, 14 Jun 2020 23:23:48 +0800 Subject: [PATCH 209/363] refactor[fix #160]: group info about one weibo to class Weibo. 
--- weibo_spider/downloader/downloader.py | 4 +- weibo_spider/downloader/img_downloader.py | 6 +-- weibo_spider/downloader/video_downloader.py | 4 +- weibo_spider/parser/page_parser.py | 44 ++++++++++----------- weibo_spider/printer.py | 14 ------- weibo_spider/weibo.py | 33 ++++++++++++++++ weibo_spider/writer/csv_writer.py | 30 ++++++-------- weibo_spider/writer/json_writer.py | 2 +- weibo_spider/writer/mongo_writer.py | 4 +- weibo_spider/writer/mysql_writer.py | 9 ++++- weibo_spider/writer/txt_writer.py | 5 ++- 11 files changed, 88 insertions(+), 67 deletions(-) delete mode 100644 weibo_spider/printer.py create mode 100644 weibo_spider/weibo.py diff --git a/weibo_spider/downloader/downloader.py b/weibo_spider/downloader/downloader.py index 38f8f081..568160d2 100644 --- a/weibo_spider/downloader/downloader.py +++ b/weibo_spider/downloader/downloader.py @@ -55,8 +55,8 @@ def download_files(self, weibos): try: print(u"即将进行%s下载" % self.describe) for w in tqdm(weibos, desc="Download progress"): - if w[self.key] != u"无": - self.handle_download(w[self.key], w) + if getattr(w, self.key) != u"无": + self.handle_download(getattr(w, self.key), w) print(u"%s下载完毕,保存路径:" % self.describe) print(self.file_dir) except Exception as e: diff --git a/weibo_spider/downloader/img_downloader.py b/weibo_spider/downloader/img_downloader.py index 88e9656d..03572258 100644 --- a/weibo_spider/downloader/img_downloader.py +++ b/weibo_spider/downloader/img_downloader.py @@ -13,7 +13,7 @@ def __init__(self, file_dir): def handle_download(self, urls, w): """处理下载相关操作""" - file_prefix = w["publish_time"][:11].replace("-", "") + "_" + w["id"] + file_prefix = w.publish_time[:11].replace("-", "") + "_" + w.id if "," in urls: url_list = urls.split(",") for i, url in enumerate(url_list): @@ -24,7 +24,7 @@ def handle_download(self, urls, w): file_suffix = url[index:] file_name = file_prefix + "_" + str(i + 1) + file_suffix file_path = self.file_dir + os.sep + file_name - self.download_one_file(url, 
file_path, w["id"]) + self.download_one_file(url, file_path, w.id) else: index = urls.rfind(".") if len(urls) - index > 5: @@ -33,4 +33,4 @@ def handle_download(self, urls, w): file_suffix = urls[index:] file_name = file_prefix + file_suffix file_path = self.file_dir + os.sep + file_name - self.download_one_file(urls, file_path, w["id"]) + self.download_one_file(urls, file_path, w.id) diff --git a/weibo_spider/downloader/video_downloader.py b/weibo_spider/downloader/video_downloader.py index caa21e91..ae9029e8 100644 --- a/weibo_spider/downloader/video_downloader.py +++ b/weibo_spider/downloader/video_downloader.py @@ -13,8 +13,8 @@ def __init__(self, file_dir): def handle_download(self, urls, w): """处理下载相关操作""" - file_prefix = w["publish_time"][:11].replace("-", "") + "_" + w["id"] + file_prefix = w.publish_time[:11].replace("-", "") + "_" + w.id file_suffix = ".mp4" file_name = file_prefix + file_suffix file_path = self.file_dir + os.sep + file_name - self.download_one_file(urls, file_path, w["id"]) + self.download_one_file(urls, file_path, w.id) diff --git a/weibo_spider/parser/page_parser.py b/weibo_spider/parser/page_parser.py index aaeb551a..2411588c 100644 --- a/weibo_spider/parser/page_parser.py +++ b/weibo_spider/parser/page_parser.py @@ -1,12 +1,12 @@ import re import sys import traceback -from collections import OrderedDict from datetime import datetime, timedelta import requests -from .. import datetime_util, printer +from .. 
import datetime_util +from ..weibo import Weibo from .comment_parser import CommentParser from .parser import Parser from .mblog_picAll_parser import MblogPicAllParser @@ -32,19 +32,19 @@ def get_one_page(self, since_date, weibo_id_list): for i in range(0, len(info) - 2): weibo = self.get_one_weibo(info[i]) if weibo: - if weibo["id"] in weibo_id_list: + if weibo.id in weibo_id_list: continue publish_time = datetime_util.str_to_time( - weibo["publish_time"]) + weibo.publish_time) if publish_time < since_date: if self.is_pinned_weibo(info[i]): continue else: return weibos, weibo_id_list - printer.print_one_weibo(weibo) + print(weibo) weibos.append(weibo) - weibo_id_list.append(weibo["id"]) + weibo_id_list.append(weibo.id) return weibos, weibo_id_list except Exception as e: print("Error: ", e) @@ -289,29 +289,29 @@ def is_pinned_weibo(self, info): def get_one_weibo(self, info): """获取一条微博的全部信息""" try: - weibo = OrderedDict() + weibo = Weibo() is_original = self.is_original(info) if (not self.filter) or is_original: - weibo["id"] = info.xpath("@id")[0][2:] - weibo["content"] = self.get_weibo_content(info, - is_original) # 微博内容 - weibo["article_url"] = self.get_article_url(info) # 头条文章url + weibo.id = info.xpath("@id")[0][2:] + weibo.content = self.get_weibo_content(info, + is_original) # 微博内容 + weibo.article_url = self.get_article_url(info) # 头条文章url picture_urls = self.get_picture_urls(info, is_original) - weibo["original_pictures"] = picture_urls[ + weibo.original_pictures = picture_urls[ "original_pictures"] # 原创图片url if not self.filter: - weibo["retweet_pictures"] = picture_urls[ + weibo.retweet_pictures = picture_urls[ "retweet_pictures"] # 转发图片url - weibo["original"] = is_original # 是否原创微博 - weibo["video_url"] = self.get_video_url(info, - is_original) # 微博视频url - weibo["publish_place"] = self.get_publish_place(info) # 微博发布位置 - weibo["publish_time"] = self.get_publish_time(info) # 微博发布时间 - weibo["publish_tool"] = self.get_publish_tool(info) # 微博发布工具 + 
weibo.original = is_original # 是否原创微博 + weibo.video_url = self.get_video_url(info, + is_original) # 微博视频url + weibo.publish_place = self.get_publish_place(info) # 微博发布位置 + weibo.publish_time = self.get_publish_time(info) # 微博发布时间 + weibo.publish_tool = self.get_publish_tool(info) # 微博发布工具 footer = self.get_weibo_footer(info) - weibo["up_num"] = footer["up_num"] # 微博点赞数 - weibo["retweet_num"] = footer["retweet_num"] # 转发数 - weibo["comment_num"] = footer["comment_num"] # 评论数 + weibo.up_num = footer["up_num"] # 微博点赞数 + weibo.retweet_num = footer["retweet_num"] # 转发数 + weibo.comment_num = footer["comment_num"] # 评论数 else: weibo = None print(u"正在过滤转发微博") diff --git a/weibo_spider/printer.py b/weibo_spider/printer.py deleted file mode 100644 index 7d2bb49e..00000000 --- a/weibo_spider/printer.py +++ /dev/null @@ -1,14 +0,0 @@ -# -*- coding: UTF-8 -*- - - -def print_one_weibo(weibo): - """打印一条微博""" - print(weibo["content"]) - print(u"微博发布位置:%s" % weibo["publish_place"]) - print(u"发布发布时间:%s" % weibo["publish_time"]) - print(u"发布发布工具:%s" % weibo["publish_tool"]) - print(u"点赞数:%d" % weibo["up_num"]) - print(u"转发数:%d" % weibo["retweet_num"]) - print(u"评论数:%d" % weibo["comment_num"]) - print(u"url:https://weibo.cn/comment/%s" % weibo["id"]) - print("-" * 100) diff --git a/weibo_spider/weibo.py b/weibo_spider/weibo.py new file mode 100644 index 00000000..44be0ff4 --- /dev/null +++ b/weibo_spider/weibo.py @@ -0,0 +1,33 @@ +class Weibo: + def __init__(self): + self.id = '' + self.user_id = '' + + self.content = '' + self.article_url = '' + + self.original_pictures = [] + self.retweet_pictures = None + self.original = None + self.video_url = '' + + self.publish_place = '' + self.publish_time = '' + self.publish_tool = '' + + self.up_num = 0 + self.retweet_num = 0 + self.comment_num = 0 + + def __str__(self): + """打印一条微博""" + result = self.content + '\n' + result += u"微博发布位置:%s\n" % self.publish_place + result += u"发布发布时间:%s\n" % self.publish_time + result += u"发布发布工具:%s\n" % 
self.publish_tool + result += u"点赞数:%d\n" % self.up_num + result += u"转发数:%d\n" % self.retweet_num + result += u"评论数:%d\n" % self.comment_num + result += u"url:https://weibo.cn/comment/%s\n" % self.id + result += "-" * 100 + "\n" + return result diff --git a/weibo_spider/writer/csv_writer.py b/weibo_spider/writer/csv_writer.py index ecb879fa..eb0e0b4e 100644 --- a/weibo_spider/writer/csv_writer.py +++ b/weibo_spider/writer/csv_writer.py @@ -8,27 +8,22 @@ class CsvWriter(Writer): def __init__(self, file_path, filter): self.file_path = file_path - result_headers = [ - "微博id", - "微博正文", - "头条文章url", - "原始图片url", - "微博视频url", - "发布位置", - "发布时间", - "发布工具", - "点赞数", - "转发数", - "评论数", - ] + self.result_headers = [('微博id', 'id'), ('微博正文', 'content'), + ('头条文章url', 'article_url'), + ('原始图片url', 'original_pictures'), + ('微博视频url', 'video_url'), + ('发布位置', 'publish_place'), + ('发布时间', 'publish_time'), + ('发布工具', 'publish_tool'), ('点赞数', 'up_num'), + ('转发数', 'retweet_num'), ('评论数', 'comment_num')] if not filter: - result_headers.insert(4, "被转发微博原始图片url") - result_headers.insert(5, "是否为原创微博") + self.result_headers.insert(4, ('被转发微博原始图片url', 'retweet_pictures')) + self.result_headers.insert(5, ('是否为原创微博', 'original')) try: with open(self.file_path, "a", encoding="utf-8-sig", newline="") as f: writer = csv.writer(f) - writer.writerows([result_headers]) + writer.writerows([[kv[0] for kv in self.result_headers]]) except Exception as e: print("Error: ", e) traceback.print_exc() @@ -39,7 +34,8 @@ def write_user(self, user): def write_weibo(self, weibos): """将爬取的信息写入csv文件""" try: - result_data = [w.values() for w in weibos] + result_data = [[w.__dict__[kv[1]] for kv in self.result_headers] + for w in weibos] with open(self.file_path, "a", encoding="utf-8-sig", newline="") as f: writer = csv.writer(f) diff --git a/weibo_spider/writer/json_writer.py b/weibo_spider/writer/json_writer.py index e55d72c7..7d1dfbff 100644 --- a/weibo_spider/writer/json_writer.py +++ 
b/weibo_spider/writer/json_writer.py @@ -43,7 +43,7 @@ def write_weibo(self, weibos): if os.path.isfile(self.file_path): with codecs.open(self.file_path, "r", encoding="utf-8") as f: data = json.load(f) - data = self._update_json_data(data, weibos) + data = self._update_json_data(data, [w.__dict__ for w in weibos]) with codecs.open(self.file_path, "w", encoding="utf-8") as f: f.write(json.dumps(data, indent=4, ensure_ascii=False)) print(u"%d条微博写入json文件完毕,保存路径:%s" % (len(weibos), self.file_path)) diff --git a/weibo_spider/writer/mongo_writer.py b/weibo_spider/writer/mongo_writer.py index f7350cbe..d21aa445 100644 --- a/weibo_spider/writer/mongo_writer.py +++ b/weibo_spider/writer/mongo_writer.py @@ -33,8 +33,8 @@ def write_weibo(self, weibos): """将爬取的微博信息写入MongoDB数据库""" weibo_list = [] for w in weibos: - w["user_id"] = self.user.id - weibo_list.append(w) + w.user_id = self.user.id + weibo_list.append(w.__dict__) self._info_to_mongodb("weibo", weibo_list) print(u"%d条微博写入MongoDB数据库完毕" % len(weibos)) diff --git a/weibo_spider/writer/mysql_writer.py b/weibo_spider/writer/mysql_writer.py index 580d3f1e..070cee1e 100644 --- a/weibo_spider/writer/mysql_writer.py +++ b/weibo_spider/writer/mysql_writer.py @@ -46,6 +46,11 @@ def _mysql_insert(self, table, data_list): """向MySQL表插入或更新数据""" import pymysql if len(data_list) > 0: + # We use this to filter out unset values. 
+ data_list = [{k: v + for k, v in data.items() if v is not None} + for data in data_list] + keys = ", ".join(data_list[0].keys()) values = ", ".join(["%s"] * len(data_list[0])) connection = pymysql.connect(**self.mysql_config) @@ -96,8 +101,8 @@ def write_weibo(self, weibos): weibo_list = [] info_list = copy.deepcopy(weibos) for weibo in info_list: - weibo["user_id"] = self.user.id - weibo_list.append(weibo) + weibo.user_id = self.user.id + weibo_list.append(weibo.__dict__) self._mysql_insert("weibo", weibo_list) print(u"%d条微博写入MySQL数据库完毕" % len(weibos)) diff --git a/weibo_spider/writer/txt_writer.py b/weibo_spider/writer/txt_writer.py index bba166c3..24ceb66d 100644 --- a/weibo_spider/writer/txt_writer.py +++ b/weibo_spider/writer/txt_writer.py @@ -42,8 +42,9 @@ def write_weibo(self, weibo): try: temp_result = [] for w in weibo: - temp_result.append(w["content"] + "\n" + "\n".join( - [v + ":" + str(w[k]) for k, v in self.weibo_desc])) + temp_result.append(w.__dict__["content"] + "\n" + "\n".join( + [v + ":" + str(w.__dict__[k]) + for k, v in self.weibo_desc])) result = "\n\n".join(temp_result) + "\n\n" with open(self.file_path, "ab") as f: From 04920154b4bd78bd3042b20070db794896c59147 Mon Sep 17 00:00:00 2001 From: Zhengyang Song Date: Sun, 14 Jun 2020 23:51:34 +0800 Subject: [PATCH 210/363] doc: update README.md. Now the weibo_spider.py use Spider rather Weibo to implement the main logic. 
--- README.md | 48 ++++++++++++++++++++++++++++++++++-------------- 1 file changed, 34 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index eefba38d..adf75f1f 100644 --- a/README.md +++ b/README.md @@ -3,6 +3,7 @@ 本程序可以连续爬取**一个**或**多个**新浪微博用户(如[胡歌](https://weibo.cn/u/1223178222)、[迪丽热巴](https://weibo.cn/u/1669879400)、[郭碧婷](https://weibo.cn/u/1729370543))的数据,并将结果信息写入**文件**或**数据库**。写入信息几乎包括用户微博的所有数据,包括**用户信息**和**微博信息**两大类。因为内容太多,这里不再赘述,详细内容见[获取到的字段](#获取到的字段)。如果只需要用户信息,可以通过设置实现只爬取微博用户信息的功能。本程序需设置cookie来获取微博访问权限,后面会讲解[如何获取cookie](#如何获取cookie)。如果不想设置cookie,可以使用[免cookie版](https://github.com/dataabc/weibo-crawler),二者功能类似。 具体的写入文件类型如下: + - 写入**txt文件**(默认) - 写入**csv文件**(默认) - 写入**json文件**(可选) @@ -17,22 +18,31 @@ ## 内容列表 -- [获取到的字段](#获取到的字段) -- [示例](#示例) -- [运行环境](#运行环境) -- [使用说明](#使用说明) -- [个性化定制程序(可选)](#个性化定制程序可选) -- [定期自动爬取微博(可选)](#定期自动爬取微博可选) -- [如何获取cookie](#如何获取cookie) -- [如何获取user_id](#如何获取user_id) -- [相关项目](#相关项目) -- [注意事项](#注意事项) +- [Weibo Spider](#weibo-spider) + - [内容列表](#内容列表) + - [获取到的字段](#获取到的字段) + - [示例](#示例) + - [运行环境](#运行环境) + - [使用说明](#使用说明) + - [0.版本](#0版本) + - [1.安装程序](#1安装程序) + - [源码安装](#源码安装) + - [pip安装](#pip安装) + - [2.程序设置](#2程序设置) + - [3.运行程序](#3运行程序) + - [个性化定制程序(可选)](#个性化定制程序可选) + - [定期自动爬取微博(可选)](#定期自动爬取微博可选) + - [如何获取cookie](#如何获取cookie) + - [如何获取user_id](#如何获取user_id) + - [相关项目](#相关项目) + - [注意事项](#注意事项) ## 获取到的字段 本部分为爬取到的字段信息说明,为了与[免cookie版](https://github.com/dataabc/weibo-crawler)区分,下面将两者爬取到的信息都列出来。如果是免cookie版所特有的信息,会有免cookie标注,没有标注的为二者共有的信息。 **用户信息** + - 用户id:微博用户id,如"1669879400",其实这个字段本来就是已知字段 - 昵称:用户昵称,如"Dear-迪丽热巴" - 性别:微博用户性别 @@ -54,8 +64,11 @@ - 是否认证(免cookie版):用户是否认证,为布尔类型 - 认证类型(免cookie版):用户认证类型,如个人认证、企业认证、政府认证等 - 认证信息:为认证用户特有,用户信息栏显示的认证信息 + *** + **微博信息** + - 微博id:微博唯一标志 - 微博内容:微博正文 - 头条文章url:微博中头条文章的url,若微博中不存在头条文章,则值为'' @@ -115,10 +128,13 @@ $ python3 -m pip install weibo-spider ### 3.运行程序 **源码安装**的用户可以在weiboSpider目录运行如下命令,**pip安装**的用户可以在任意有写权限的目录运行如下命令 + ```bash $ python3 -m weibo_spider ``` + 
第一次执行,会自动在当前目录创建config.json配置文件,配置好后执行同样的命令就可以获取微博了。如果你已经有config.json文件了,也可以通过config_path参数配置config.json路径,运行程序,命令行如下: + ```bash $ python3 -m weibo_spider --config_path="config.json" ``` @@ -127,16 +143,18 @@ $ python3 -m weibo_spider --config_path="config.json" 本部分为可选部分,如果不需要个性化定制程序或添加新功能,可以忽略此部分。 -本程序主体代码位于weibo_spider.py文件,程序主体是一个Weibo类,上述所有功能都是通过在main函数调用Weibo类实现的,默认的调用代码如下: +本程序主体代码位于weibo_spider.py文件,程序主体是一个 Spider 类,上述所有功能都是通过在main函数调用 Spider 类实现的,默认的调用代码如下: + ```python config = get_config() - wb = Weibo(config) + wb = Spider(config) wb.start() # 爬取微博信息 ``` -用户可以按照自己的需求调用或修改Weibo类。通过执行本程序,我们可以得到很多信息。 + +用户可以按照自己的需求调用或修改 Spider 类。通过执行本程序,我们可以得到很多信息。
- + 点击查看详情 - wb.user['nickname']:用户昵称; @@ -154,11 +172,13 @@ $ python3 -m weibo_spider --config_path="config.json"
**wb.weibo**:除不包含上述信息外,wb.weibo包含爬取到的所有微博信息,如**微博id**、**微博正文**、**原始图片url**、**发布位置**、**发布时间**、**发布工具**、**点赞数**、**转发数**、**评论数**等。如果爬的是全部微博(原创+转发),除上述信息之外,还包含被**转发微博原始图片url**、**是否为原创微博**等。wb.weibo是一个列表,包含了爬取的所有微博信息。wb.weibo[0]为爬取的第一条微博,wb.weibo[1]为爬取的第二条微博,以此类推。当filter=1时,wb.weibo[0]为爬取的第一条**原创**微博,以此类推。wb.weibo[0]['id']为第一条微博的id,wb.weibo[0]['content']为第一条微博的正文,wb.weibo[0]['publish_time']为第一条微博的发布时间,还有其它很多信息不在赘述,大家可以点击下面的"详情"查看具体用法。 +
详情 若目标微博用户存在微博,则: + - id:存储微博id。如wb.weibo[0]['id']为最新一条微博的id; - content:存储微博正文。如wb.weibo[0]['content']为最新一条微博的正文; - article_url:存储微博中头条文章的url。如wb.weibo[0]['article_url']为最新一条微博的头条文章url,若微博中不存在头条文章,则值为''; From a99fd137e57d5b75b95c675ee46fb04e42bf99bf Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Mon, 15 Jun 2020 01:25:59 +0800 Subject: [PATCH 211/363] Create FAQ.md --- docs/FAQ.md | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 docs/FAQ.md diff --git a/docs/FAQ.md b/docs/FAQ.md new file mode 100644 index 00000000..7a9d5d5c --- /dev/null +++ b/docs/FAQ.md @@ -0,0 +1,30 @@ +## 常见问题 + +### 程序运行出错,错误提示中包含“'NoneType' object”字样,如何解决? +这是最常见的问题之一。出错原因是爬取速度太快,被暂时限制了。限制可能包含爬虫账号限制和ip限制。一般情况下,一段时间后限制会自动解除。可通过降低爬取速度避免被限制,具体修改weibo_spider.py文件中get_weibo_info方法的如下代码: +``` + if (page - page1) % random_pages == 0 and page < page_num: + sleep(random.randint(6, 10)) + page1 = page + random_pages = random.randint(1, 5) +``` +上面的意思是每爬取1到5页,随机等待6到10秒。可以通过加快暂停频率(减小random_pages值)或增加等待时间(加大sleep内的值)避免被限制。 +如果你设置了只爬取用户信息(不爬用户的微博),则需修改weibo_spider.py文件中的start方法,原来的代码是这样的: +``` + for user_config in self.user_config_list: + ...... +``` +修改后的代码是这样的: +``` + user_count = 0 + user_count1 = random.randint(1, 5) + random_users = random.randint(1, 5) + for user_config in self.user_config_list: + if (user_count - user_count1) % random_users == 0: + sleep(random.randint(6, 10)) + user_count1 = user_count + random_users = random.randint(1, 5) + user_count += 1 + ...... +``` +上面的意思是每爬1到5个用户,随机等待6到10秒,你可以根据实际情况,修改代码中的数字。 From 3c4d3a51f64cc6e26fc36396ab687e92ea3b2d89 Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Tue, 16 Jun 2020 00:20:28 +0800 Subject: [PATCH 212/363] Update FAQ.md --- docs/FAQ.md | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/docs/FAQ.md b/docs/FAQ.md index 7a9d5d5c..5ddd2ca7 100644 --- a/docs/FAQ.md +++ b/docs/FAQ.md @@ -1,6 +1,6 @@ ## 常见问题 -### 程序运行出错,错误提示中包含“'NoneType' object”字样,如何解决? 
+### 1.程序运行出错,错误提示中包含“'NoneType' object”字样,如何解决? 这是最常见的问题之一。出错原因是爬取速度太快,被暂时限制了。限制可能包含爬虫账号限制和ip限制。一般情况下,一段时间后限制会自动解除。可通过降低爬取速度避免被限制,具体修改weibo_spider.py文件中get_weibo_info方法的如下代码: ``` if (page - page1) % random_pages == 0 and page < page_num: @@ -28,3 +28,25 @@ ...... ``` 上面的意思是每爬1到5个用户,随机等待6到10秒,你可以根据实际情况,修改代码中的数字。 + +### 2.如何获取微博评论? +因为限制,只能获取一部分评论,无法获取全部,因此暂时没有添加获取评论功能的计划。 + +### 3.有的长微博正文只能获取一部分内容,如何解决? +程序是可以获取长微博全文的。程序首先在微博列表页获取微博,如果发现长微博(正文没有显示完整,以“全文”代替部分内容的微博),会先保存这个不全的内容,然后去该长微博的详情页尝试获取全文,如果获取成功,获取的内容就是微博文本;如果获取失败,等待若干秒重新获取;如果连续尝试5次都失败,就用上面不全的内容代替。这样做的原因是避免因部分长微博获取失败而卡住。如果想尝试更多次,可以修改comment_parser.py文件get_long_weibo方法内for循环的次数。 + +### 4.如何按指定关键词获取微博? +请使用[weibo-search](https://github.com/dataabc/weibo-search)。该程序可以连续获取一个或多个微博关键词搜索结果,并将结果写入文件(可选)、数据库(可选)等。所谓微博关键词搜索即:搜索正文中包含指定关键词的微博,可以指定搜索的时间范围。对于非常热门的关键词,一天的时间范围,可以获得1000万以上的搜索结果,N天的时间范围就可以获得1000万 X N搜索结果。对于大多数关键词,一天产生的相应微博数量应该在1000万条以下,因此可以说该程序可以获得大部分关键词的全部或近似全部的搜索结果。而且该程序可以获得搜索结果的所有信息,本程序获得的微博信息该程序都能获得。 + +### 5.如何获取微博用户关注列表中用户的user_id? +请使用[weibo-follow](https://github.com/dataabc/weibo-follow)。该程序可以利用一个user_id,获取该user_id微博用户关注人的user_id,一个user_id最多可以获得200个user_id,并写入user_id_list.txt文件。程序支持读文件,利用这200个user_id,可以获得最多200X200=40000个user_id。再利用这40000个user_id可以得到40000X200=8000000个user_id,如此反复,以此类推,可以获得大量user_id。本项目也支持读文件,将上述程序的结果文件user_id_list.txt路径赋值给本项目config.json的user_id_list参数,就可以获得这些user_id用户所发布的大量微博。 + +### 6.如何获取自己的微博? +修改info_parser.py和page_parser.py中__init__方法,将前者的self.url修改为: +``` + self.url = "https://weibo.cn/%s/profile" % (user_id) +``` +后者的self.url修改为: +``` + self.url = "https://weibo.cn/%s/profile?page=%d" % (user_uri, page) +``` From c6d2d8e820ed8e06e7ca25383812a0acacda443a Mon Sep 17 00:00:00 2001 From: Zhengyang Song Date: Thu, 18 Jun 2020 23:41:27 +0800 Subject: [PATCH 213/363] fix: minor fix of weibo printer to only include necessary info. 
--- weibo_spider/parser/page_parser.py | 1 + weibo_spider/weibo.py | 5 ++--- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/weibo_spider/parser/page_parser.py b/weibo_spider/parser/page_parser.py index 2411588c..326892f2 100644 --- a/weibo_spider/parser/page_parser.py +++ b/weibo_spider/parser/page_parser.py @@ -43,6 +43,7 @@ def get_one_page(self, since_date, weibo_id_list): else: return weibos, weibo_id_list print(weibo) + print("-" * 100) weibos.append(weibo) weibo_id_list.append(weibo.id) return weibos, weibo_id_list diff --git a/weibo_spider/weibo.py b/weibo_spider/weibo.py index 44be0ff4..a5a1d98f 100644 --- a/weibo_spider/weibo.py +++ b/weibo_spider/weibo.py @@ -23,11 +23,10 @@ def __str__(self): """打印一条微博""" result = self.content + '\n' result += u"微博发布位置:%s\n" % self.publish_place - result += u"发布发布时间:%s\n" % self.publish_time - result += u"发布发布工具:%s\n" % self.publish_tool + result += u"发布时间:%s\n" % self.publish_time + result += u"发布工具:%s\n" % self.publish_tool result += u"点赞数:%d\n" % self.up_num result += u"转发数:%d\n" % self.retweet_num result += u"评论数:%d\n" % self.comment_num result += u"url:https://weibo.cn/comment/%s\n" % self.id - result += "-" * 100 + "\n" return result From 86d18ca815af48e9723f42a6c688cdf725d74f3f Mon Sep 17 00:00:00 2001 From: Zhengyang Song Date: Thu, 18 Jun 2020 23:42:06 +0800 Subject: [PATCH 214/363] feat: add a mechanism to generate mock test data for url requests. --- weibo_spider/parser/util.py | 31 +++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/weibo_spider/parser/util.py b/weibo_spider/parser/util.py index ec5b1f7d..3724c83e 100644 --- a/weibo_spider/parser/util.py +++ b/weibo_spider/parser/util.py @@ -1,15 +1,42 @@ import sys import traceback +import hashlib import requests from lxml import etree +# Set GENERATE_TEST_DATA to True when generating test data. 
+GENERATE_TEST_DATA = False +TEST_DATA_DIR = 'tests/testdata' +URL_MAP_FILE = "url_map.json" + + +def hash_url(url): + return hashlib.sha224(url.encode('utf8')).hexdigest() + def handle_html(cookie, url): """处理html""" try: - html = requests.get(url, cookies=cookie).content - selector = etree.HTML(html) + resp = requests.get(url, cookies=cookie) + + if GENERATE_TEST_DATA: + import io + import json + import os + + resp_file = os.path.join(TEST_DATA_DIR, "%s.html" % hash_url(url)) + with io.open(resp_file, "w") as f: + f.write(resp.text) + + with io.open(os.path.join(TEST_DATA_DIR, URL_MAP_FILE), 'r+') as f: + url_map = json.loads(f.read()) + url_map[url] = resp_file + f.seek(0) + f.write(json.dumps(url_map, indent=4, ensure_ascii=False)) + f.truncate() + + selector = etree.HTML(resp.content) return selector except Exception as e: print("Error: ", e) From 9810d424b96ae0c7e9d509a69768b7879b6759ab Mon Sep 17 00:00:00 2001 From: Zhengyang Song Date: Thu, 18 Jun 2020 23:43:12 +0800 Subject: [PATCH 215/363] tests: add a test and related test data for page_parser. 
--- tests/__init__.py | 0 tests/test_page_parser.py | 44 +++++++++++++++++++ ...398d385c377a068b76eb95765f7020ffffd3e.html | 1 + ...74b5537dea736dfb34e48d8835203a45d2e67.html | 1 + ...55bca03cbf5988f7eac233a23d86b4fdf5ffd.html | 1 + ...6f19cfa5d674a610e8b442b1f83de7673ab49.html | 1 + ...ae1595186ac063fe5ec25cf2f98116ece83cb.html | 1 + ...66fa90afb2d54d19f8c898e164204a61bdf03.html | 1 + ...0f2cc77b3648abfa03580b9e0cdb61f1e618f.html | 1 + ...bd3f352f8cad3e36fdd19e40b69e1c33fb3c3.html | 1 + tests/testdata/url_map.json | 10 +++++ 11 files changed, 62 insertions(+) create mode 100644 tests/__init__.py create mode 100644 tests/test_page_parser.py create mode 100644 tests/testdata/2f62165fa3ca1e85e0d398d385c377a068b76eb95765f7020ffffd3e.html create mode 100644 tests/testdata/4957814af5a123b82e974b5537dea736dfb34e48d8835203a45d2e67.html create mode 100644 tests/testdata/63a98849ec82b2c87ec55bca03cbf5988f7eac233a23d86b4fdf5ffd.html create mode 100644 tests/testdata/76233b3f90394581aac6f19cfa5d674a610e8b442b1f83de7673ab49.html create mode 100644 tests/testdata/a4437630f3bdfa2757bae1595186ac063fe5ec25cf2f98116ece83cb.html create mode 100644 tests/testdata/ca5f2a555e8d62f728c66fa90afb2d54d19f8c898e164204a61bdf03.html create mode 100644 tests/testdata/d486235d4a17dd0accb0f2cc77b3648abfa03580b9e0cdb61f1e618f.html create mode 100644 tests/testdata/e97222acd5bc7d8d1bfbd3f352f8cad3e36fdd19e40b69e1c33fb3c3.html create mode 100644 tests/testdata/url_map.json diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/test_page_parser.py b/tests/test_page_parser.py new file mode 100644 index 00000000..8c11cb40 --- /dev/null +++ b/tests/test_page_parser.py @@ -0,0 +1,44 @@ +from unittest.mock import patch, Mock +import json +import os + +from weibo_spider.parser.page_parser import PageParser +from weibo_spider.parser.util import TEST_DATA_DIR, URL_MAP_FILE + + +def mock_request_get_content(url, cookies): + with 
open(os.path.join(TEST_DATA_DIR, URL_MAP_FILE)) as f: + url_map = json.loads(f.read()) + resp_file = url_map[url] + mock = Mock() + with open(resp_file, "rb") as f: + mock.content = f.read() + return mock + + +@patch('requests.get', mock_request_get_content) +def test_page_parser(): + page_parser = PageParser(cookie="", + user_uri="1669879400", + page=2, + filter=True) + weibos, weibo_id_list = page_parser.get_one_page("2020-06-01", []) + assert (weibo_id_list == ['J4PGk4yMw', 'J4EUStJKu']) + assert (len(weibos) == 2) + assert (str(weibos[0]) == """生日动态 \xa0\n""" + """微博发布位置:无\n""" + """发布时间:2020-06-03 00:00\n""" + """发布工具:生日动态\n""" + """点赞数:1499637\n""" + """转发数:1000000\n""" + """评论数:1000000\n""" + """url:https://weibo.cn/comment/J4PGk4yMw\n""") + assert (str(weibos[1]) == + """#微博剧场# #周放设计淡黄的长裙# 这是一幅有声音的手稿#幸福触手可及# 绿洲 \xa0原图\xa0\n""" + """微博发布位置:无\n""" + """发布时间:2020-06-01 20:35\n""" + """发布工具:绿洲APP\n""" + """点赞数:419172\n""" + """转发数:1000000\n""" + """评论数:1000000\n""" + """url:https://weibo.cn/comment/J4EUStJKu\n""") diff --git a/tests/testdata/2f62165fa3ca1e85e0d398d385c377a068b76eb95765f7020ffffd3e.html b/tests/testdata/2f62165fa3ca1e85e0d398d385c377a068b76eb95765f7020ffffd3e.html new file mode 100644 index 00000000..9f980e99 --- /dev/null +++ b/tests/testdata/2f62165fa3ca1e85e0d398d385c377a068b76eb95765f7020ffffd3e.html @@ -0,0 +1 @@ +Dear-迪丽热巴的微博
Dear-迪丽热巴的微博 加关注
 微博  相册 
生日动态  
赞[1499637] 转发[1000000] 评论[1000000] 收藏 06月03日 00:00 来自生日动态
炎炎夏日让每天的沐浴时光都变得尤其重要,精致的沙龙香相伴让沐浴也可以成为清新浪漫的享受!给大家@LUX力士 的沐浴小秘密分享,有力士植萃沐浴露,把沐浴变成“仪式感”!我的心选好物分享给你们啦 [笑而不语] LUX力士的微博视频  
赞[377571] 转发[1000000] 评论[1000000] 收藏 05月31日 10:59
#idoltube##周放vlog# 第二篇来啦!今天邀请大家走进生活,走进幸福的放放子一家~[喵喵]#幸福触手可及# Dear-迪丽热巴的微博视频  
赞[397951] 转发[1000000] 评论[1000000] 收藏 05月30日 19:02 来自国产剧集 · 视频社区
@法国娇韵诗 收到宠爱了~小娇的618#娇宠你有一套#,早晚护肤都靠它,超级喜欢这份宠爱!现在给全体爱丽丝们施法,希望你们都可以拥有这份让你变美的娇宠礼物哦~同款娇宠http://t.cn/A62cgDJp一起享用!  [组图共2张]
#微博剧场# 我为4A景区代言,酷飒周放的追剧邀请,你来吗? #4A景区触手可及#
@路易威登 PONT 9 手袋 陪你摩登一夏[嘻嘻]#LVPONT9#  [组图共3张]
#热巴手稿填色大赛#服装手稿填色游戏正式开启!图一出自迪迪子,图二出自放放子。迪迪子的面子就靠大家的后期填色了[微笑] 绿洲  [组图共2张]
图片 原图 
赞[733669] 转发[1000000] 评论[1000000] 收藏 05月27日 14:48 来自绿洲APP
转发了 护舒宝VM 的微博:还记得和宝宝陪着@Dear-迪丽热巴 走过的花路吗?谢谢阿丝们一直以来的陪伴[太开心][太开心]~为你甄选护舒宝天然纯棉卫生巾,给你透气亲肤的体验。现在上天猫超市购买,1套减25,第2套只要19.9。未来的花路,和宝宝一起用好物,守护热巴!#迪丽热巴[超话]#
图片 原图 赞[43521] 原文转发[1000000] 原文评论[13967]
转发理由:谢谢@护舒宝 和阿丝们的守护,每一刻都非常有意义。未来请继续指教啦~  
赞[418833] 转发[1000000] 评论[1000000] 收藏 05月26日 11:14
#idoltube##周放vlog# 放放子的第一支搞事业篇vlog已上线~约vlog的朋友们可以放下你们的号码牌了[可爱] #幸福触手可及# Dear-迪丽热巴的微博视频  
赞[450537] 转发[1000000] 评论[216934] 收藏 05月25日 20:53 来自影视剪辑 · 视频社区
下页 上页 首页  2/117页
TOP
彩版|触屏|语音
weibo.cn[06-18 23:38]
\ No newline at end of file diff --git a/tests/testdata/4957814af5a123b82e974b5537dea736dfb34e48d8835203a45d2e67.html b/tests/testdata/4957814af5a123b82e974b5537dea736dfb34e48d8835203a45d2e67.html new file mode 100644 index 00000000..858a8ee4 --- /dev/null +++ b/tests/testdata/4957814af5a123b82e974b5537dea736dfb34e48d8835203a45d2e67.html @@ -0,0 +1 @@ +Dear-迪丽热巴的微博
头像
Dear-迪丽热巴VM 女/上海   加关注
认证:嘉行传媒签约演员 
一只喜欢默默表演的小透明。工作联系jaywalk@jaywalk.com.cn ...
私信 资料 操作 特别关注 送Ta会员
 微博  相册 
#一起热爱就现在#给你们康康我眼前的画面[嘻嘻] 绿洲
图片 原图 
赞[508797] 转发[1000000] 评论[391666] 收藏 06月17日 18:12 来自绿洲APP
刚收到我定制的亓那眼镜,猜猜定制了什么[doge]好奇?没关系,你们也可以拥有自己的定制眼镜。关注@QINA亓那眼镜 解锁6月限定惊喜,#时髦寻宝计划# 线上线下都安排了[偷笑]QINA亓那眼镜的微博视频  
赞[414547] 转发[1000000] 评论[1000000] 收藏 06月15日 10:09
#idoltube##周放vlog# 什么?放放子还有两副面孔呢?[喵喵] #幸福触手可及# Dear-迪丽热巴的微博视频  
赞[317796] 转发[1000000] 评论[514532] 收藏 06月14日 20:09 来自影视剪辑 · 视频社区
图片 原图 
赞[1150003] 转发[1000000] 评论[1000000] 收藏 06月12日 19:11 来自绿洲APP
言出必行,说了18张就是18张,送给七千万的你们 ~  [组图共18张]
放放子缺个快板[偷笑] 绿洲
图片 原图 
赞[571684] 转发[1000000] 评论[1000000] 收藏 06月08日 15:17 来自绿洲APP
转发了 WCS野生生物保护学会V 的微博:去年和亲善大使热巴@Dear-迪丽热巴 的特别回忆[心]。我们在藏北羌塘一起爬山,探访藏羚羊、雪豹、黑颈鹤的栖息地,感受野生动物保护工作的点滴。此时此刻,我们比以往更加重视与自然相处的方式,我们也从未如此迫切需要将想法付诸行动。热巴已经和我们@北京绿色阳光 站在一起,希望看完视频的你们,也...全文 赞[119295] 原文转发[1000000] 原文评论[38687]
转发理由:在羌塘的美好回忆~第一次来到这片独特的荒野,看到野生动物自由生活,还有一群快乐可爱的人在守护着它们。把这些美好留存下来,关注野生动物保护,积极行动,我们每个人都能贡献力量。[心]  
赞[554383] 转发[1000000] 评论[1000000] 收藏 06月05日 11:11
要开心。要充实。
#微博live秀# 28岁的直播~@Dear-迪丽热巴 的一直播(下载App->http://t.cn/RDUuslr 
赞[435625] 转发[23583] 评论[1000000] 收藏 06月03日 19:00 来自一直播Yi
下页  1/117页
TOP
彩版|触屏|语音
weibo.cn[06-18 23:37]
\ No newline at end of file diff --git a/tests/testdata/63a98849ec82b2c87ec55bca03cbf5988f7eac233a23d86b4fdf5ffd.html b/tests/testdata/63a98849ec82b2c87ec55bca03cbf5988f7eac233a23d86b4fdf5ffd.html new file mode 100644 index 00000000..ea9bdf4c --- /dev/null +++ b/tests/testdata/63a98849ec82b2c87ec55bca03cbf5988f7eac233a23d86b4fdf5ffd.html @@ -0,0 +1 @@ +微博
图片加载中... 1/18 原图
图片加载中... 2/18 原图
图片加载中... 3/18 原图
图片加载中... 4/18 原图
图片加载中... 5/18 原图
图片加载中... 6/18 原图
图片加载中... 7/18 原图
图片加载中... 8/18 原图
图片加载中... 9/18 原图
图片加载中... 10/18 原图
图片加载中... 11/18 原图
图片加载中... 12/18 原图
图片加载中... 13/18 原图
图片加载中... 14/18 原图
图片加载中... 15/18 原图
图片加载中... 16/18 原图
图片加载中... 17/18 原图
图片加载中... 18/18 原图
\ No newline at end of file diff --git a/tests/testdata/76233b3f90394581aac6f19cfa5d674a610e8b442b1f83de7673ab49.html b/tests/testdata/76233b3f90394581aac6f19cfa5d674a610e8b442b1f83de7673ab49.html new file mode 100644 index 00000000..9e6330f9 --- /dev/null +++ b/tests/testdata/76233b3f90394581aac6f19cfa5d674a610e8b442b1f83de7673ab49.html @@ -0,0 +1 @@ +微博
图片加载中... 1/2 原图
图片加载中... 2/2 原图
\ No newline at end of file diff --git a/tests/testdata/a4437630f3bdfa2757bae1595186ac063fe5ec25cf2f98116ece83cb.html b/tests/testdata/a4437630f3bdfa2757bae1595186ac063fe5ec25cf2f98116ece83cb.html new file mode 100644 index 00000000..280d824a --- /dev/null +++ b/tests/testdata/a4437630f3bdfa2757bae1595186ac063fe5ec25cf2f98116ece83cb.html @@ -0,0 +1 @@ +Dear-迪丽热巴的微博
头像
Dear-迪丽热巴VM 女/上海   加关注
认证:嘉行传媒签约演员 
一只喜欢默默表演的小透明。工作联系jaywalk@jaywalk.com.cn ...
私信 资料 操作 特别关注 送Ta会员
 微博  相册 
#一起热爱就现在#给你们康康我眼前的画面[嘻嘻] 绿洲
图片 原图 
赞[508798] 转发[1000000] 评论[391666] 收藏 06月17日 18:12 来自绿洲APP
刚收到我定制的亓那眼镜,猜猜定制了什么[doge]好奇?没关系,你们也可以拥有自己的定制眼镜。关注@QINA亓那眼镜 解锁6月限定惊喜,#时髦寻宝计划# 线上线下都安排了[偷笑]QINA亓那眼镜的微博视频  
赞[414547] 转发[1000000] 评论[1000000] 收藏 06月15日 10:09
#idoltube##周放vlog# 什么?放放子还有两副面孔呢?[喵喵] #幸福触手可及# Dear-迪丽热巴的微博视频  
赞[317796] 转发[1000000] 评论[514532] 收藏 06月14日 20:09 来自影视剪辑 · 视频社区
图片 原图 
赞[1150003] 转发[1000000] 评论[1000000] 收藏 06月12日 19:11 来自绿洲APP
言出必行,说了18张就是18张,送给七千万的你们 ~  [组图共18张]
放放子缺个快板[偷笑] 绿洲
图片 原图 
赞[571684] 转发[1000000] 评论[1000000] 收藏 06月08日 15:17 来自绿洲APP
转发了 WCS野生生物保护学会V 的微博:去年和亲善大使热巴@Dear-迪丽热巴 的特别回忆[心]。我们在藏北羌塘一起爬山,探访藏羚羊、雪豹、黑颈鹤的栖息地,感受野生动物保护工作的点滴。此时此刻,我们比以往更加重视与自然相处的方式,我们也从未如此迫切需要将想法付诸行动。热巴已经和我们@北京绿色阳光 站在一起,希望看完视频的你们,也...全文 赞[119295] 原文转发[1000000] 原文评论[38687]
转发理由:在羌塘的美好回忆~第一次来到这片独特的荒野,看到野生动物自由生活,还有一群快乐可爱的人在守护着它们。把这些美好留存下来,关注野生动物保护,积极行动,我们每个人都能贡献力量。[心]  
赞[554383] 转发[1000000] 评论[1000000] 收藏 06月05日 11:11
要开心。要充实。
#微博live秀# 28岁的直播~@Dear-迪丽热巴 的一直播(下载App->http://t.cn/RDUuslr 
赞[435625] 转发[23583] 评论[1000000] 收藏 06月03日 19:00 来自一直播Yi
下页  1/117页
TOP
彩版|触屏|语音
weibo.cn[06-18 23:37]
\ No newline at end of file diff --git a/tests/testdata/ca5f2a555e8d62f728c66fa90afb2d54d19f8c898e164204a61bdf03.html b/tests/testdata/ca5f2a555e8d62f728c66fa90afb2d54d19f8c898e164204a61bdf03.html new file mode 100644 index 00000000..2990dbe4 --- /dev/null +++ b/tests/testdata/ca5f2a555e8d62f728c66fa90afb2d54d19f8c898e164204a61bdf03.html @@ -0,0 +1 @@ +Dear-迪丽热巴的资料
头像
会员等级:7级 送Ta会员
微身份 语惊四座 七步成诗 谈笑风生 更多勋章
基本信息
昵称:Dear-迪丽热巴
认证:嘉行传媒签约演员 
性别:女
地区:上海
生日:双子座
认证信息:嘉行传媒签约演员 
简介:一只喜欢默默表演的小透明。工作联系jaywalk@jaywalk.com.cn 🍒
学习经历
·上海戏剧学院
工作经历
·嘉行传媒 
其他信息
互联网:http://weibo.com/u/1669879400
手机版:https://weibo.cn/u/1669879400
她的相册>>
TOP
彩版|触屏|语音
weibo.cn[06-18 23:37]
\ No newline at end of file diff --git a/tests/testdata/d486235d4a17dd0accb0f2cc77b3648abfa03580b9e0cdb61f1e618f.html b/tests/testdata/d486235d4a17dd0accb0f2cc77b3648abfa03580b9e0cdb61f1e618f.html new file mode 100644 index 00000000..bfb31fe9 --- /dev/null +++ b/tests/testdata/d486235d4a17dd0accb0f2cc77b3648abfa03580b9e0cdb61f1e618f.html @@ -0,0 +1 @@ +Dear-迪丽热巴的微博
Dear-迪丽热巴的微博 加关注
 微博  相册 
粉色天空、闪耀夜色、浪漫爱意…我把我喜爱的元素和巴黎限定记忆全部定格在这一瓶#YSL反转巴黎#热爱限定中,第一次与YSL一起合作设计香水,在这#拦不住的夏天#把甜甜的曼陀罗花香送给你们,喜欢吗?💓  [组图共2张]
#幸福触手可及开播##幸福触手可及# 度量自身,方能修炼精彩人生。追梦不易,披荆斩棘。今晚八点@湖南卫视 和周放,一起守护梦想,书写初夏。
很高兴成为力士大中华区沐浴系列代言人,520就要到啦,大家快来接收告白福利哦!全新植萃泡泡沐浴露让每一位小仙女都能在浓密泡泡浴中拥有夏日嫩白肌,仙气香气都十足!关注@LUX力士 第一时间锁定新品哦!LUX力士的微博视频  
赞[741400] 转发[1000000] 评论[1000000] 收藏 05月19日 09:03
转发了 电视剧幸福触手可及VM 的微博:#幸福触手可及##幸福触手可及定档0519# 从没有一个时刻,幸福如此靠近,只因有你在身边[心]5月19日20:00锁定@湖南卫视 金鹰独播剧场,@优酷 @爱奇艺 @腾讯视频 24点同步更新,等你解锁初夏甜梦!
图片 原图 赞[129675] 原文转发[332651] 原文评论[6900]
转发理由:#幸福触手可及定档0519# 唯有热爱,不负韶华,为之全力以赴,才能成为更优秀的人。5月19日20:00锁定湖南卫视#幸福触手可及# ,愈挫愈勇的独立设计师周放来啦。  
赞[480417] 转发[57012] 评论[40966] 收藏 05月15日 20:23
哈哈哈哈哈哈👅
转发了 北京2022年冬奥会VM 的微博:【爱豆喊你来助力#北京2022#
花样滑冰,旋转跳跃 ,“迪丽”前行 @Dear-迪丽热巴 北京2022年冬奥会的微博视频
 赞[680201] 原文转发[1450782] 原文评论[50694]
转发理由:与我一起,关注花样滑冰,为中国健儿鼓劲加油[加油]  
赞[501777] 转发[1000000] 评论[1000000] 收藏 05月15日 10:20
转发了 央视网VM 的微博:【想看看战疫一线医护人员们的脸!#极限挑战致敬医护人员#】脱下防疫服,援鄂人员们原来是这个模样。八位医护人员集体分享支援一线的故事,是他们为后方的我们竖起了最坚实的屏障,感谢这群医护天使的负重前行,致敬!@央视网青年 @雷佳音 @岳云鹏 @演员王迅 @贾乃亮 @努力努力再努力x @Dear-迪丽热巴...全文 赞[364004] 原文转发[1056354] 原文评论[3645]
转发理由:#极限挑战# 感谢你们的守护,最美的逆行者们[心]  
赞[571255] 转发[1000000] 评论[362127] 收藏 05月10日 23:01
#极限挑战# 无奖填词竞答,今晚看👉登峰造_,不可_量,百里_一,南征北_~ Dear-迪丽热巴的微博视频  
赞[731517] 转发[1000000] 评论[1000000] 收藏 05月10日 16:50
转发了 中国青年报VM 的微博:#五四致敬战疫青年# #青春万岁#各地应急响应级别陆续下调,我们正在走向痊愈。回望这些年轻医务人员的脸,不应忘记,正是他们在危难之下,白衣执甲,毅然逆行,为我们筑起血肉长城。感恩提灯天使,致敬最可爱的人!春暖花开,等到疫情完全解除,无论你是从医还是就医,请记住医患之间的休戚与共、唇齿...全文 [组图共12张]
图片 原图 赞[32125] 原文转发[4801627] 原文评论[6975]
转发理由:#五四致敬战疫青年#五四青年节前夕,让我们说一声,#谢谢你保护了我们#!  
赞[721484] 转发[1000000] 评论[597487] 收藏 05月02日 16:40
转发了 东方卫视极限挑战VM 的微博:鸡条君目睹了vivo#极限挑战#第六季首发阵容@雷佳音 @岳云鹏 @演员王迅 @贾乃亮 @努力努力再努力x @Dear-迪丽热巴 @郭京飞 @邓伦 集结的整个过程,这就是欢迎新人的方式[疑问]说好要相亲相爱的呢😂东方卫视极限挑战的微博视频  赞[711006] 原文转发[1409504] 原文评论[14761]
转发理由:#极限挑战#举手之劳,岳岳哥别客气!//@岳云鹏:#极限挑战#谢谢热巴@Dear-迪丽热巴 给我p图,我这里还有好多库存 查看图片  
赞[983375] 转发[1000000] 评论[1000000] 收藏 04月30日 12:30
下页 上页 首页  3/117页
TOP
彩版|触屏|语音
weibo.cn[06-18 23:38]
\ No newline at end of file diff --git a/tests/testdata/e97222acd5bc7d8d1bfbd3f352f8cad3e36fdd19e40b69e1c33fb3c3.html b/tests/testdata/e97222acd5bc7d8d1bfbd3f352f8cad3e36fdd19e40b69e1c33fb3c3.html new file mode 100644 index 00000000..e6ef95e4 --- /dev/null +++ b/tests/testdata/e97222acd5bc7d8d1bfbd3f352f8cad3e36fdd19e40b69e1c33fb3c3.html @@ -0,0 +1 @@ +微博
图片加载中... 1/2 原图
图片加载中... 2/2 原图
\ No newline at end of file diff --git a/tests/testdata/url_map.json b/tests/testdata/url_map.json new file mode 100644 index 00000000..b51dd445 --- /dev/null +++ b/tests/testdata/url_map.json @@ -0,0 +1,10 @@ +{ + "https://weibo.cn/1669879400": "tests/testdata/a4437630f3bdfa2757bae1595186ac063fe5ec25cf2f98116ece83cb.html", + "https://weibo.cn/1669879400/info": "tests/testdata/ca5f2a555e8d62f728c66fa90afb2d54d19f8c898e164204a61bdf03.html", + "https://weibo.cn/1669879400?page=1": "tests/testdata/4957814af5a123b82e974b5537dea736dfb34e48d8835203a45d2e67.html", + "https://weibo.cn/mblog/picAll/J6k49kbTc?rl=1": "tests/testdata/e97222acd5bc7d8d1bfbd3f352f8cad3e36fdd19e40b69e1c33fb3c3.html", + "https://weibo.cn/mblog/picAll/J5ZcSnCAg?rl=1": "tests/testdata/63a98849ec82b2c87ec55bca03cbf5988f7eac233a23d86b4fdf5ffd.html", + "https://weibo.cn/1669879400?page=2": "tests/testdata/2f62165fa3ca1e85e0d398d385c377a068b76eb95765f7020ffffd3e.html", + "https://weibo.cn/1669879400?page=3": "tests/testdata/d486235d4a17dd0accb0f2cc77b3648abfa03580b9e0cdb61f1e618f.html", + "https://weibo.cn/mblog/picAll/J3xfm61AZ?rl=1": "tests/testdata/76233b3f90394581aac6f19cfa5d674a610e8b442b1f83de7673ab49.html" +} \ No newline at end of file From 4281c274148a61bcde868a4f1c498180c482c018 Mon Sep 17 00:00:00 2001 From: Zhengyang Song Date: Thu, 18 Jun 2020 23:43:12 +0800 Subject: [PATCH 216/363] tests[#168]: add a test and related test data for page_parser. 
--- tests/__init__.py | 0 tests/test_page_parser.py | 44 +++++++++++++++++++ ...398d385c377a068b76eb95765f7020ffffd3e.html | 1 + ...74b5537dea736dfb34e48d8835203a45d2e67.html | 1 + ...55bca03cbf5988f7eac233a23d86b4fdf5ffd.html | 1 + ...6f19cfa5d674a610e8b442b1f83de7673ab49.html | 1 + ...ae1595186ac063fe5ec25cf2f98116ece83cb.html | 1 + ...66fa90afb2d54d19f8c898e164204a61bdf03.html | 1 + ...0f2cc77b3648abfa03580b9e0cdb61f1e618f.html | 1 + ...bd3f352f8cad3e36fdd19e40b69e1c33fb3c3.html | 1 + tests/testdata/url_map.json | 10 +++++ 11 files changed, 62 insertions(+) create mode 100644 tests/__init__.py create mode 100644 tests/test_page_parser.py create mode 100644 tests/testdata/2f62165fa3ca1e85e0d398d385c377a068b76eb95765f7020ffffd3e.html create mode 100644 tests/testdata/4957814af5a123b82e974b5537dea736dfb34e48d8835203a45d2e67.html create mode 100644 tests/testdata/63a98849ec82b2c87ec55bca03cbf5988f7eac233a23d86b4fdf5ffd.html create mode 100644 tests/testdata/76233b3f90394581aac6f19cfa5d674a610e8b442b1f83de7673ab49.html create mode 100644 tests/testdata/a4437630f3bdfa2757bae1595186ac063fe5ec25cf2f98116ece83cb.html create mode 100644 tests/testdata/ca5f2a555e8d62f728c66fa90afb2d54d19f8c898e164204a61bdf03.html create mode 100644 tests/testdata/d486235d4a17dd0accb0f2cc77b3648abfa03580b9e0cdb61f1e618f.html create mode 100644 tests/testdata/e97222acd5bc7d8d1bfbd3f352f8cad3e36fdd19e40b69e1c33fb3c3.html create mode 100644 tests/testdata/url_map.json diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/test_page_parser.py b/tests/test_page_parser.py new file mode 100644 index 00000000..8c11cb40 --- /dev/null +++ b/tests/test_page_parser.py @@ -0,0 +1,44 @@ +from unittest.mock import patch, Mock +import json +import os + +from weibo_spider.parser.page_parser import PageParser +from weibo_spider.parser.util import TEST_DATA_DIR, URL_MAP_FILE + + +def mock_request_get_content(url, cookies): + with 
open(os.path.join(TEST_DATA_DIR, URL_MAP_FILE)) as f: + url_map = json.loads(f.read()) + resp_file = url_map[url] + mock = Mock() + with open(resp_file, "rb") as f: + mock.content = f.read() + return mock + + +@patch('requests.get', mock_request_get_content) +def test_page_parser(): + page_parser = PageParser(cookie="", + user_uri="1669879400", + page=2, + filter=True) + weibos, weibo_id_list = page_parser.get_one_page("2020-06-01", []) + assert (weibo_id_list == ['J4PGk4yMw', 'J4EUStJKu']) + assert (len(weibos) == 2) + assert (str(weibos[0]) == """生日动态 \xa0\n""" + """微博发布位置:无\n""" + """发布时间:2020-06-03 00:00\n""" + """发布工具:生日动态\n""" + """点赞数:1499637\n""" + """转发数:1000000\n""" + """评论数:1000000\n""" + """url:https://weibo.cn/comment/J4PGk4yMw\n""") + assert (str(weibos[1]) == + """#微博剧场# #周放设计淡黄的长裙# 这是一幅有声音的手稿#幸福触手可及# 绿洲 \xa0原图\xa0\n""" + """微博发布位置:无\n""" + """发布时间:2020-06-01 20:35\n""" + """发布工具:绿洲APP\n""" + """点赞数:419172\n""" + """转发数:1000000\n""" + """评论数:1000000\n""" + """url:https://weibo.cn/comment/J4EUStJKu\n""") diff --git a/tests/testdata/2f62165fa3ca1e85e0d398d385c377a068b76eb95765f7020ffffd3e.html b/tests/testdata/2f62165fa3ca1e85e0d398d385c377a068b76eb95765f7020ffffd3e.html new file mode 100644 index 00000000..9f980e99 --- /dev/null +++ b/tests/testdata/2f62165fa3ca1e85e0d398d385c377a068b76eb95765f7020ffffd3e.html @@ -0,0 +1 @@ +Dear-迪丽热巴的微博
Dear-迪丽热巴的微博 加关注
 微博  相册 
生日动态  
赞[1499637] 转发[1000000] 评论[1000000] 收藏 06月03日 00:00 来自生日动态
炎炎夏日让每天的沐浴时光都变得尤其重要,精致的沙龙香相伴让沐浴也可以成为清新浪漫的享受!给大家@LUX力士 的沐浴小秘密分享,有力士植萃沐浴露,把沐浴变成“仪式感”!我的心选好物分享给你们啦 [笑而不语] LUX力士的微博视频  
赞[377571] 转发[1000000] 评论[1000000] 收藏 05月31日 10:59
#idoltube##周放vlog# 第二篇来啦!今天邀请大家走进生活,走进幸福的放放子一家~[喵喵]#幸福触手可及# Dear-迪丽热巴的微博视频  
赞[397951] 转发[1000000] 评论[1000000] 收藏 05月30日 19:02 来自国产剧集 · 视频社区
@法国娇韵诗 收到宠爱了~小娇的618#娇宠你有一套#,早晚护肤都靠它,超级喜欢这份宠爱!现在给全体爱丽丝们施法,希望你们都可以拥有这份让你变美的娇宠礼物哦~同款娇宠http://t.cn/A62cgDJp一起享用!  [组图共2张]
#微博剧场# 我为4A景区代言,酷飒周放的追剧邀请,你来吗? #4A景区触手可及#
@路易威登 PONT 9 手袋 陪你摩登一夏[嘻嘻]#LVPONT9#  [组图共3张]
#热巴手稿填色大赛#服装手稿填色游戏正式开启!图一出自迪迪子,图二出自放放子。迪迪子的面子就靠大家的后期填色了[微笑] 绿洲  [组图共2张]
图片 原图 
赞[733669] 转发[1000000] 评论[1000000] 收藏 05月27日 14:48 来自绿洲APP
转发了 护舒宝VM 的微博:还记得和宝宝陪着@Dear-迪丽热巴 走过的花路吗?谢谢阿丝们一直以来的陪伴[太开心][太开心]~为你甄选护舒宝天然纯棉卫生巾,给你透气亲肤的体验。现在上天猫超市购买,1套减25,第2套只要19.9。未来的花路,和宝宝一起用好物,守护热巴!#迪丽热巴[超话]#
图片 原图 赞[43521] 原文转发[1000000] 原文评论[13967]
转发理由:谢谢@护舒宝 和阿丝们的守护,每一刻都非常有意义。未来请继续指教啦~  
赞[418833] 转发[1000000] 评论[1000000] 收藏 05月26日 11:14
#idoltube##周放vlog# 放放子的第一支搞事业篇vlog已上线~约vlog的朋友们可以放下你们的号码牌了[可爱] #幸福触手可及# Dear-迪丽热巴的微博视频  
赞[450537] 转发[1000000] 评论[216934] 收藏 05月25日 20:53 来自影视剪辑 · 视频社区
下页 上页 首页  2/117页
TOP
彩版|触屏|语音
weibo.cn[06-18 23:38]
\ No newline at end of file diff --git a/tests/testdata/4957814af5a123b82e974b5537dea736dfb34e48d8835203a45d2e67.html b/tests/testdata/4957814af5a123b82e974b5537dea736dfb34e48d8835203a45d2e67.html new file mode 100644 index 00000000..858a8ee4 --- /dev/null +++ b/tests/testdata/4957814af5a123b82e974b5537dea736dfb34e48d8835203a45d2e67.html @@ -0,0 +1 @@ +Dear-迪丽热巴的微博
头像
Dear-迪丽热巴VM 女/上海   加关注
认证:嘉行传媒签约演员 
一只喜欢默默表演的小透明。工作联系jaywalk@jaywalk.com.cn ...
私信 资料 操作 特别关注 送Ta会员
 微博  相册 
#一起热爱就现在#给你们康康我眼前的画面[嘻嘻] 绿洲
图片 原图 
赞[508797] 转发[1000000] 评论[391666] 收藏 06月17日 18:12 来自绿洲APP
刚收到我定制的亓那眼镜,猜猜定制了什么[doge]好奇?没关系,你们也可以拥有自己的定制眼镜。关注@QINA亓那眼镜 解锁6月限定惊喜,#时髦寻宝计划# 线上线下都安排了[偷笑]QINA亓那眼镜的微博视频  
赞[414547] 转发[1000000] 评论[1000000] 收藏 06月15日 10:09
#idoltube##周放vlog# 什么?放放子还有两副面孔呢?[喵喵] #幸福触手可及# Dear-迪丽热巴的微博视频  
赞[317796] 转发[1000000] 评论[514532] 收藏 06月14日 20:09 来自影视剪辑 · 视频社区
图片 原图 
赞[1150003] 转发[1000000] 评论[1000000] 收藏 06月12日 19:11 来自绿洲APP
言出必行,说了18张就是18张,送给七千万的你们 ~  [组图共18张]
放放子缺个快板[偷笑] 绿洲
图片 原图 
赞[571684] 转发[1000000] 评论[1000000] 收藏 06月08日 15:17 来自绿洲APP
转发了 WCS野生生物保护学会V 的微博:去年和亲善大使热巴@Dear-迪丽热巴 的特别回忆[心]。我们在藏北羌塘一起爬山,探访藏羚羊、雪豹、黑颈鹤的栖息地,感受野生动物保护工作的点滴。此时此刻,我们比以往更加重视与自然相处的方式,我们也从未如此迫切需要将想法付诸行动。热巴已经和我们@北京绿色阳光 站在一起,希望看完视频的你们,也...全文 赞[119295] 原文转发[1000000] 原文评论[38687]
转发理由:在羌塘的美好回忆~第一次来到这片独特的荒野,看到野生动物自由生活,还有一群快乐可爱的人在守护着它们。把这些美好留存下来,关注野生动物保护,积极行动,我们每个人都能贡献力量。[心]  
赞[554383] 转发[1000000] 评论[1000000] 收藏 06月05日 11:11
要开心。要充实。
#微博live秀# 28岁的直播~@Dear-迪丽热巴 的一直播(下载App->http://t.cn/RDUuslr 
赞[435625] 转发[23583] 评论[1000000] 收藏 06月03日 19:00 来自一直播Yi
下页  1/117页
TOP
彩版|触屏|语音
weibo.cn[06-18 23:37]
\ No newline at end of file diff --git a/tests/testdata/63a98849ec82b2c87ec55bca03cbf5988f7eac233a23d86b4fdf5ffd.html b/tests/testdata/63a98849ec82b2c87ec55bca03cbf5988f7eac233a23d86b4fdf5ffd.html new file mode 100644 index 00000000..ea9bdf4c --- /dev/null +++ b/tests/testdata/63a98849ec82b2c87ec55bca03cbf5988f7eac233a23d86b4fdf5ffd.html @@ -0,0 +1 @@ +微博
图片加载中... 1/18 原图
图片加载中... 2/18 原图
图片加载中... 3/18 原图
图片加载中... 4/18 原图
图片加载中... 5/18 原图
图片加载中... 6/18 原图
图片加载中... 7/18 原图
图片加载中... 8/18 原图
图片加载中... 9/18 原图
图片加载中... 10/18 原图
图片加载中... 11/18 原图
图片加载中... 12/18 原图
图片加载中... 13/18 原图
图片加载中... 14/18 原图
图片加载中... 15/18 原图
图片加载中... 16/18 原图
图片加载中... 17/18 原图
图片加载中... 18/18 原图
\ No newline at end of file diff --git a/tests/testdata/76233b3f90394581aac6f19cfa5d674a610e8b442b1f83de7673ab49.html b/tests/testdata/76233b3f90394581aac6f19cfa5d674a610e8b442b1f83de7673ab49.html new file mode 100644 index 00000000..9e6330f9 --- /dev/null +++ b/tests/testdata/76233b3f90394581aac6f19cfa5d674a610e8b442b1f83de7673ab49.html @@ -0,0 +1 @@ +微博
图片加载中... 1/2 原图
图片加载中... 2/2 原图
\ No newline at end of file diff --git a/tests/testdata/a4437630f3bdfa2757bae1595186ac063fe5ec25cf2f98116ece83cb.html b/tests/testdata/a4437630f3bdfa2757bae1595186ac063fe5ec25cf2f98116ece83cb.html new file mode 100644 index 00000000..280d824a --- /dev/null +++ b/tests/testdata/a4437630f3bdfa2757bae1595186ac063fe5ec25cf2f98116ece83cb.html @@ -0,0 +1 @@ +Dear-迪丽热巴的微博
头像
Dear-迪丽热巴VM 女/上海   加关注
认证:嘉行传媒签约演员 
一只喜欢默默表演的小透明。工作联系jaywalk@jaywalk.com.cn ...
私信 资料 操作 特别关注 送Ta会员
 微博  相册 
#一起热爱就现在#给你们康康我眼前的画面[嘻嘻] 绿洲
图片 原图 
赞[508798] 转发[1000000] 评论[391666] 收藏 06月17日 18:12 来自绿洲APP
刚收到我定制的亓那眼镜,猜猜定制了什么[doge]好奇?没关系,你们也可以拥有自己的定制眼镜。关注@QINA亓那眼镜 解锁6月限定惊喜,#时髦寻宝计划# 线上线下都安排了[偷笑]QINA亓那眼镜的微博视频  
赞[414547] 转发[1000000] 评论[1000000] 收藏 06月15日 10:09
#idoltube##周放vlog# 什么?放放子还有两副面孔呢?[喵喵] #幸福触手可及# Dear-迪丽热巴的微博视频  
赞[317796] 转发[1000000] 评论[514532] 收藏 06月14日 20:09 来自影视剪辑 · 视频社区
图片 原图 
赞[1150003] 转发[1000000] 评论[1000000] 收藏 06月12日 19:11 来自绿洲APP
言出必行,说了18张就是18张,送给七千万的你们 ~  [组图共18张]
放放子缺个快板[偷笑] 绿洲
图片 原图 
赞[571684] 转发[1000000] 评论[1000000] 收藏 06月08日 15:17 来自绿洲APP
转发了 WCS野生生物保护学会V 的微博:去年和亲善大使热巴@Dear-迪丽热巴 的特别回忆[心]。我们在藏北羌塘一起爬山,探访藏羚羊、雪豹、黑颈鹤的栖息地,感受野生动物保护工作的点滴。此时此刻,我们比以往更加重视与自然相处的方式,我们也从未如此迫切需要将想法付诸行动。热巴已经和我们@北京绿色阳光 站在一起,希望看完视频的你们,也...全文 赞[119295] 原文转发[1000000] 原文评论[38687]
转发理由:在羌塘的美好回忆~第一次来到这片独特的荒野,看到野生动物自由生活,还有一群快乐可爱的人在守护着它们。把这些美好留存下来,关注野生动物保护,积极行动,我们每个人都能贡献力量。[心]  
赞[554383] 转发[1000000] 评论[1000000] 收藏 06月05日 11:11
要开心。要充实。
#微博live秀# 28岁的直播~@Dear-迪丽热巴 的一直播(下载App->http://t.cn/RDUuslr 
赞[435625] 转发[23583] 评论[1000000] 收藏 06月03日 19:00 来自一直播Yi
下页  1/117页
TOP
彩版|触屏|语音
weibo.cn[06-18 23:37]
\ No newline at end of file diff --git a/tests/testdata/ca5f2a555e8d62f728c66fa90afb2d54d19f8c898e164204a61bdf03.html b/tests/testdata/ca5f2a555e8d62f728c66fa90afb2d54d19f8c898e164204a61bdf03.html new file mode 100644 index 00000000..2990dbe4 --- /dev/null +++ b/tests/testdata/ca5f2a555e8d62f728c66fa90afb2d54d19f8c898e164204a61bdf03.html @@ -0,0 +1 @@ +Dear-迪丽热巴的资料
头像
会员等级:7级 送Ta会员
微身份 语惊四座 七步成诗 谈笑风生 更多勋章
基本信息
昵称:Dear-迪丽热巴
认证:嘉行传媒签约演员 
性别:女
地区:上海
生日:双子座
认证信息:嘉行传媒签约演员 
简介:一只喜欢默默表演的小透明。工作联系jaywalk@jaywalk.com.cn 🍒
学习经历
·上海戏剧学院
工作经历
·嘉行传媒 
其他信息
互联网:http://weibo.com/u/1669879400
手机版:https://weibo.cn/u/1669879400
她的相册>>
TOP
彩版|触屏|语音
weibo.cn[06-18 23:37]
\ No newline at end of file diff --git a/tests/testdata/d486235d4a17dd0accb0f2cc77b3648abfa03580b9e0cdb61f1e618f.html b/tests/testdata/d486235d4a17dd0accb0f2cc77b3648abfa03580b9e0cdb61f1e618f.html new file mode 100644 index 00000000..bfb31fe9 --- /dev/null +++ b/tests/testdata/d486235d4a17dd0accb0f2cc77b3648abfa03580b9e0cdb61f1e618f.html @@ -0,0 +1 @@ +Dear-迪丽热巴的微博
Dear-迪丽热巴的微博 加关注
 微博  相册 
粉色天空、闪耀夜色、浪漫爱意…我把我喜爱的元素和巴黎限定记忆全部定格在这一瓶#YSL反转巴黎#热爱限定中,第一次与YSL一起合作设计香水,在这#拦不住的夏天#把甜甜的曼陀罗花香送给你们,喜欢吗?💓  [组图共2张]
#幸福触手可及开播##幸福触手可及# 度量自身,方能修炼精彩人生。追梦不易,披荆斩棘。今晚八点@湖南卫视 和周放,一起守护梦想,书写初夏。
很高兴成为力士大中华区沐浴系列代言人,520就要到啦,大家快来接收告白福利哦!全新植萃泡泡沐浴露让每一位小仙女都能在浓密泡泡浴中拥有夏日嫩白肌,仙气香气都十足!关注@LUX力士 第一时间锁定新品哦!LUX力士的微博视频  
赞[741400] 转发[1000000] 评论[1000000] 收藏 05月19日 09:03
转发了 电视剧幸福触手可及VM 的微博:#幸福触手可及##幸福触手可及定档0519# 从没有一个时刻,幸福如此靠近,只因有你在身边[心]5月19日20:00锁定@湖南卫视 金鹰独播剧场,@优酷 @爱奇艺 @腾讯视频 24点同步更新,等你解锁初夏甜梦!
图片 原图 赞[129675] 原文转发[332651] 原文评论[6900]
转发理由:#幸福触手可及定档0519# 唯有热爱,不负韶华,为之全力以赴,才能成为更优秀的人。5月19日20:00锁定湖南卫视#幸福触手可及# ,愈挫愈勇的独立设计师周放来啦。  
赞[480417] 转发[57012] 评论[40966] 收藏 05月15日 20:23
哈哈哈哈哈哈👅
转发了 北京2022年冬奥会VM 的微博:【爱豆喊你来助力#北京2022#
花样滑冰,旋转跳跃 ,“迪丽”前行 @Dear-迪丽热巴 北京2022年冬奥会的微博视频
 赞[680201] 原文转发[1450782] 原文评论[50694]
转发理由:与我一起,关注花样滑冰,为中国健儿鼓劲加油[加油]  
赞[501777] 转发[1000000] 评论[1000000] 收藏 05月15日 10:20
转发了 央视网VM 的微博:【想看看战疫一线医护人员们的脸!#极限挑战致敬医护人员#】脱下防疫服,援鄂人员们原来是这个模样。八位医护人员集体分享支援一线的故事,是他们为后方的我们竖起了最坚实的屏障,感谢这群医护天使的负重前行,致敬!@央视网青年 @雷佳音 @岳云鹏 @演员王迅 @贾乃亮 @努力努力再努力x @Dear-迪丽热巴...全文 赞[364004] 原文转发[1056354] 原文评论[3645]
转发理由:#极限挑战# 感谢你们的守护,最美的逆行者们[心]  
赞[571255] 转发[1000000] 评论[362127] 收藏 05月10日 23:01
#极限挑战# 无奖填词竞答,今晚看👉登峰造_,不可_量,百里_一,南征北_~ Dear-迪丽热巴的微博视频  
赞[731517] 转发[1000000] 评论[1000000] 收藏 05月10日 16:50
转发了 中国青年报VM 的微博:#五四致敬战疫青年# #青春万岁#各地应急响应级别陆续下调,我们正在走向痊愈。回望这些年轻医务人员的脸,不应忘记,正是他们在危难之下,白衣执甲,毅然逆行,为我们筑起血肉长城。感恩提灯天使,致敬最可爱的人!春暖花开,等到疫情完全解除,无论你是从医还是就医,请记住医患之间的休戚与共、唇齿...全文 [组图共12张]
图片 原图 赞[32125] 原文转发[4801627] 原文评论[6975]
转发理由:#五四致敬战疫青年#五四青年节前夕,让我们说一声,#谢谢你保护了我们#!  
赞[721484] 转发[1000000] 评论[597487] 收藏 05月02日 16:40
转发了 东方卫视极限挑战VM 的微博:鸡条君目睹了vivo#极限挑战#第六季首发阵容@雷佳音 @岳云鹏 @演员王迅 @贾乃亮 @努力努力再努力x @Dear-迪丽热巴 @郭京飞 @邓伦 集结的整个过程,这就是欢迎新人的方式[疑问]说好要相亲相爱的呢😂东方卫视极限挑战的微博视频  赞[711006] 原文转发[1409504] 原文评论[14761]
转发理由:#极限挑战#举手之劳,岳岳哥别客气!//@岳云鹏:#极限挑战#谢谢热巴@Dear-迪丽热巴 给我p图,我这里还有好多库存 查看图片  
赞[983375] 转发[1000000] 评论[1000000] 收藏 04月30日 12:30
下页 上页 首页  3/117页
TOP
彩版|触屏|语音
weibo.cn[06-18 23:38]
\ No newline at end of file diff --git a/tests/testdata/e97222acd5bc7d8d1bfbd3f352f8cad3e36fdd19e40b69e1c33fb3c3.html b/tests/testdata/e97222acd5bc7d8d1bfbd3f352f8cad3e36fdd19e40b69e1c33fb3c3.html new file mode 100644 index 00000000..e6ef95e4 --- /dev/null +++ b/tests/testdata/e97222acd5bc7d8d1bfbd3f352f8cad3e36fdd19e40b69e1c33fb3c3.html @@ -0,0 +1 @@ +微博
图片加载中... 1/2 原图
图片加载中... 2/2 原图
\ No newline at end of file diff --git a/tests/testdata/url_map.json b/tests/testdata/url_map.json new file mode 100644 index 00000000..b51dd445 --- /dev/null +++ b/tests/testdata/url_map.json @@ -0,0 +1,10 @@ +{ + "https://weibo.cn/1669879400": "tests/testdata/a4437630f3bdfa2757bae1595186ac063fe5ec25cf2f98116ece83cb.html", + "https://weibo.cn/1669879400/info": "tests/testdata/ca5f2a555e8d62f728c66fa90afb2d54d19f8c898e164204a61bdf03.html", + "https://weibo.cn/1669879400?page=1": "tests/testdata/4957814af5a123b82e974b5537dea736dfb34e48d8835203a45d2e67.html", + "https://weibo.cn/mblog/picAll/J6k49kbTc?rl=1": "tests/testdata/e97222acd5bc7d8d1bfbd3f352f8cad3e36fdd19e40b69e1c33fb3c3.html", + "https://weibo.cn/mblog/picAll/J5ZcSnCAg?rl=1": "tests/testdata/63a98849ec82b2c87ec55bca03cbf5988f7eac233a23d86b4fdf5ffd.html", + "https://weibo.cn/1669879400?page=2": "tests/testdata/2f62165fa3ca1e85e0d398d385c377a068b76eb95765f7020ffffd3e.html", + "https://weibo.cn/1669879400?page=3": "tests/testdata/d486235d4a17dd0accb0f2cc77b3648abfa03580b9e0cdb61f1e618f.html", + "https://weibo.cn/mblog/picAll/J3xfm61AZ?rl=1": "tests/testdata/76233b3f90394581aac6f19cfa5d674a610e8b442b1f83de7673ab49.html" +} \ No newline at end of file From c2719e3d359762f7ee3dddc3396dc32b62b944c8 Mon Sep 17 00:00:00 2001 From: Zhengyang Song Date: Fri, 19 Jun 2020 00:49:15 +0800 Subject: [PATCH 217/363] fix: bug fix for comment_parser. 
--- weibo_spider/parser/comment_parser.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/weibo_spider/parser/comment_parser.py b/weibo_spider/parser/comment_parser.py index 53c2ab59..ea28acb7 100644 --- a/weibo_spider/parser/comment_parser.py +++ b/weibo_spider/parser/comment_parser.py @@ -3,7 +3,7 @@ from time import sleep from .parser import Parser -from .util import handle_html +from .util import handle_html, handle_garbled class CommentParser(Parser): @@ -16,10 +16,10 @@ def get_long_weibo(self): """获取长原创微博""" try: for i in range(5): - self.selector = self.handle_html(self.cookie, self.url) + self.selector = handle_html(self.cookie, self.url) if self.selector is not None: info = self.selector.xpath("//div[@class='c']")[1] - wb_content = self.handle_garbled(info) + wb_content = handle_garbled(info) wb_time = info.xpath("//span[@class='ct']/text()")[0] weibo_content = wb_content[wb_content.find(":") + 1:wb_content.rfind(wb_time)] From 666e7b7a521c13a853b88885392225a7de54d21c Mon Sep 17 00:00:00 2001 From: Zhengyang Song Date: Fri, 19 Jun 2020 00:49:36 +0800 Subject: [PATCH 218/363] tests: update test data and add test for comment_parser. 
--- tests/test_comment_parser.py | 26 +++++++++++++++++++ tests/test_page_parser.py | 20 +++----------- ...398d385c377a068b76eb95765f7020ffffd3e.html | 2 +- ...74b5537dea736dfb34e48d8835203a45d2e67.html | 2 +- ...edd544dbc0ab5e86d43e103405f0c60515884.html | 1 + ...6f19cfa5d674a610e8b442b1f83de7673ab49.html | 2 +- ...ae1595186ac063fe5ec25cf2f98116ece83cb.html | 2 +- ...66fa90afb2d54d19f8c898e164204a61bdf03.html | 2 +- ...0f2cc77b3648abfa03580b9e0cdb61f1e618f.html | 2 +- ...bd3f352f8cad3e36fdd19e40b69e1c33fb3c3.html | 2 +- tests/testdata/url_map.json | 3 ++- tests/util.py | 15 +++++++++++ 12 files changed, 55 insertions(+), 24 deletions(-) create mode 100644 tests/test_comment_parser.py create mode 100644 tests/testdata/4d5ed0a3ebd0303cb45edd544dbc0ab5e86d43e103405f0c60515884.html create mode 100644 tests/util.py diff --git a/tests/test_comment_parser.py b/tests/test_comment_parser.py new file mode 100644 index 00000000..51c79374 --- /dev/null +++ b/tests/test_comment_parser.py @@ -0,0 +1,26 @@ +from unittest.mock import patch + +from .util import mock_request_get_content +from weibo_spider.parser.comment_parser import CommentParser + + +@patch('requests.get', mock_request_get_content) +def test_comment_parser(): + comment_parser = CommentParser(cookie="", weibo_id="J5cVGuUNq") + long_weibo = comment_parser.get_long_weibo() + long_retweet = comment_parser.get_long_retweet() + assert ( + long_retweet == """去年和亲善大使热巴@Dear-迪丽热巴 的特别回忆。""" + """我们在藏北羌塘一起爬山,探访藏羚羊、雪豹、黑颈鹤的栖息地,感受野生动物保护工作的点滴。""" + """此时此刻,我们比以往更加重视与自然相处的方式,我们也从未如此迫切需要将想法付诸行动。""" + """热巴已经和我们@北京绿色阳光 站在一起,希望看完视频的你们,也能获得同样感受与动力。""" + """We Stand for Wildlife. 明日朝阳68309的优酷视频 \xa0""") + assert ( + long_weibo == """去年和亲善大使热巴@Dear-迪丽热巴 的特别回忆。""" + """我们在藏北羌塘一起爬山,探访藏羚羊、雪豹、黑颈鹤的栖息地,感受野生动物保护工作的点滴。""" + """此时此刻,我们比以往更加重视与自然相处的方式,我们也从未如此迫切需要将想法付诸行动。""" + """热巴已经和我们@北京绿色阳光 站在一起,希望看完视频的你们,也能获得同样感受与动力。""" + """We Stand for Wildlife. 
明日朝阳68309的优酷视频 \xa0""" + """原文转发[1000000] \xa0原文评论[38688] 转发理由: 在羌塘的美好回忆~""" + """第一次来到这片独特的荒野,看到野生动物自由生活,还有一群快乐可爱的人在守护着它们。""" + """把这些美好留存下来,关注野生动物保护,积极行动,我们每个人都能贡献力量。 \xa0 """) diff --git a/tests/test_page_parser.py b/tests/test_page_parser.py index 8c11cb40..a6b2b211 100644 --- a/tests/test_page_parser.py +++ b/tests/test_page_parser.py @@ -1,19 +1,7 @@ -from unittest.mock import patch, Mock -import json -import os +from unittest.mock import patch +from .util import mock_request_get_content from weibo_spider.parser.page_parser import PageParser -from weibo_spider.parser.util import TEST_DATA_DIR, URL_MAP_FILE - - -def mock_request_get_content(url, cookies): - with open(os.path.join(TEST_DATA_DIR, URL_MAP_FILE)) as f: - url_map = json.loads(f.read()) - resp_file = url_map[url] - mock = Mock() - with open(resp_file, "rb") as f: - mock.content = f.read() - return mock @patch('requests.get', mock_request_get_content) @@ -29,7 +17,7 @@ def test_page_parser(): """微博发布位置:无\n""" """发布时间:2020-06-03 00:00\n""" """发布工具:生日动态\n""" - """点赞数:1499637\n""" + """点赞数:1499675\n""" """转发数:1000000\n""" """评论数:1000000\n""" """url:https://weibo.cn/comment/J4PGk4yMw\n""") @@ -38,7 +26,7 @@ def test_page_parser(): """微博发布位置:无\n""" """发布时间:2020-06-01 20:35\n""" """发布工具:绿洲APP\n""" - """点赞数:419172\n""" + """点赞数:419181\n""" """转发数:1000000\n""" """评论数:1000000\n""" """url:https://weibo.cn/comment/J4EUStJKu\n""") diff --git a/tests/testdata/2f62165fa3ca1e85e0d398d385c377a068b76eb95765f7020ffffd3e.html b/tests/testdata/2f62165fa3ca1e85e0d398d385c377a068b76eb95765f7020ffffd3e.html index 9f980e99..1147fda2 100644 --- a/tests/testdata/2f62165fa3ca1e85e0d398d385c377a068b76eb95765f7020ffffd3e.html +++ b/tests/testdata/2f62165fa3ca1e85e0d398d385c377a068b76eb95765f7020ffffd3e.html @@ -1 +1 @@ -Dear-迪丽热巴的微博
Dear-迪丽热巴的微博 加关注
 微博  相册 
生日动态  
赞[1499637] 转发[1000000] 评论[1000000] 收藏 06月03日 00:00 来自生日动态
炎炎夏日让每天的沐浴时光都变得尤其重要,精致的沙龙香相伴让沐浴也可以成为清新浪漫的享受!给大家@LUX力士 的沐浴小秘密分享,有力士植萃沐浴露,把沐浴变成“仪式感”!我的心选好物分享给你们啦 [笑而不语] LUX力士的微博视频  
赞[377571] 转发[1000000] 评论[1000000] 收藏 05月31日 10:59
#idoltube##周放vlog# 第二篇来啦!今天邀请大家走进生活,走进幸福的放放子一家~[喵喵]#幸福触手可及# Dear-迪丽热巴的微博视频  
赞[397951] 转发[1000000] 评论[1000000] 收藏 05月30日 19:02 来自国产剧集 · 视频社区
@法国娇韵诗 收到宠爱了~小娇的618#娇宠你有一套#,早晚护肤都靠它,超级喜欢这份宠爱!现在给全体爱丽丝们施法,希望你们都可以拥有这份让你变美的娇宠礼物哦~同款娇宠http://t.cn/A62cgDJp一起享用!  [组图共2张]
#微博剧场# 我为4A景区代言,酷飒周放的追剧邀请,你来吗? #4A景区触手可及#
@路易威登 PONT 9 手袋 陪你摩登一夏[嘻嘻]#LVPONT9#  [组图共3张]
#热巴手稿填色大赛#服装手稿填色游戏正式开启!图一出自迪迪子,图二出自放放子。迪迪子的面子就靠大家的后期填色了[微笑] 绿洲  [组图共2张]
图片 原图 
赞[733669] 转发[1000000] 评论[1000000] 收藏 05月27日 14:48 来自绿洲APP
转发了 护舒宝VM 的微博:还记得和宝宝陪着@Dear-迪丽热巴 走过的花路吗?谢谢阿丝们一直以来的陪伴[太开心][太开心]~为你甄选护舒宝天然纯棉卫生巾,给你透气亲肤的体验。现在上天猫超市购买,1套减25,第2套只要19.9。未来的花路,和宝宝一起用好物,守护热巴!#迪丽热巴[超话]#
图片 原图 赞[43521] 原文转发[1000000] 原文评论[13967]
转发理由:谢谢@护舒宝 和阿丝们的守护,每一刻都非常有意义。未来请继续指教啦~  
赞[418833] 转发[1000000] 评论[1000000] 收藏 05月26日 11:14
#idoltube##周放vlog# 放放子的第一支搞事业篇vlog已上线~约vlog的朋友们可以放下你们的号码牌了[可爱] #幸福触手可及# Dear-迪丽热巴的微博视频  
赞[450537] 转发[1000000] 评论[216934] 收藏 05月25日 20:53 来自影视剪辑 · 视频社区
下页 上页 首页  2/117页
TOP
彩版|触屏|语音
weibo.cn[06-18 23:38]
\ No newline at end of file +Dear-迪丽热巴的微博
Dear-迪丽热巴的微博 加关注
 微博  相册 
生日动态  
赞[1499675] 转发[1000000] 评论[1000000] 收藏 06月03日 00:00 来自生日动态
炎炎夏日让每天的沐浴时光都变得尤其重要,精致的沙龙香相伴让沐浴也可以成为清新浪漫的享受!给大家@LUX力士 的沐浴小秘密分享,有力士植萃沐浴露,把沐浴变成“仪式感”!我的心选好物分享给你们啦 [笑而不语] LUX力士的微博视频  
赞[377578] 转发[1000000] 评论[1000000] 收藏 05月31日 10:59
#idoltube##周放vlog# 第二篇来啦!今天邀请大家走进生活,走进幸福的放放子一家~[喵喵]#幸福触手可及# Dear-迪丽热巴的微博视频  
赞[397970] 转发[1000000] 评论[1000000] 收藏 05月30日 19:02 来自国产剧集 · 视频社区
@法国娇韵诗 收到宠爱了~小娇的618#娇宠你有一套#,早晚护肤都靠它,超级喜欢这份宠爱!现在给全体爱丽丝们施法,希望你们都可以拥有这份让你变美的娇宠礼物哦~同款娇宠http://t.cn/A62cgDJp一起享用!  [组图共2张]
#微博剧场# 我为4A景区代言,酷飒周放的追剧邀请,你来吗? #4A景区触手可及#
@路易威登 PONT 9 手袋 陪你摩登一夏[嘻嘻]#LVPONT9#  [组图共3张]
#热巴手稿填色大赛#服装手稿填色游戏正式开启!图一出自迪迪子,图二出自放放子。迪迪子的面子就靠大家的后期填色了[微笑] 绿洲  [组图共2张]
图片 原图 
赞[733671] 转发[1000000] 评论[1000000] 收藏 05月27日 14:48 来自绿洲APP
转发了 护舒宝VM 的微博:还记得和宝宝陪着@Dear-迪丽热巴 走过的花路吗?谢谢阿丝们一直以来的陪伴[太开心][太开心]~为你甄选护舒宝天然纯棉卫生巾,给你透气亲肤的体验。现在上天猫超市购买,1套减25,第2套只要19.9。未来的花路,和宝宝一起用好物,守护热巴!#迪丽热巴[超话]#
图片 原图 赞[43521] 原文转发[1000000] 原文评论[13967]
转发理由:谢谢@护舒宝 和阿丝们的守护,每一刻都非常有意义。未来请继续指教啦~  
赞[418834] 转发[1000000] 评论[1000000] 收藏 05月26日 11:14
#idoltube##周放vlog# 放放子的第一支搞事业篇vlog已上线~约vlog的朋友们可以放下你们的号码牌了[可爱] #幸福触手可及# Dear-迪丽热巴的微博视频  
赞[450541] 转发[1000000] 评论[216934] 收藏 05月25日 20:53 来自影视剪辑 · 视频社区
下页 上页 首页  2/117页
TOP
彩版|触屏|语音
weibo.cn[06-19 00:47]
\ No newline at end of file diff --git a/tests/testdata/4957814af5a123b82e974b5537dea736dfb34e48d8835203a45d2e67.html b/tests/testdata/4957814af5a123b82e974b5537dea736dfb34e48d8835203a45d2e67.html index 858a8ee4..92824e90 100644 --- a/tests/testdata/4957814af5a123b82e974b5537dea736dfb34e48d8835203a45d2e67.html +++ b/tests/testdata/4957814af5a123b82e974b5537dea736dfb34e48d8835203a45d2e67.html @@ -1 +1 @@ -Dear-迪丽热巴的微博
头像
Dear-迪丽热巴VM 女/上海   加关注
认证:嘉行传媒签约演员 
一只喜欢默默表演的小透明。工作联系jaywalk@jaywalk.com.cn ...
私信 资料 操作 特别关注 送Ta会员
 微博  相册 
#一起热爱就现在#给你们康康我眼前的画面[嘻嘻] 绿洲
图片 原图 
赞[508797] 转发[1000000] 评论[391666] 收藏 06月17日 18:12 来自绿洲APP
刚收到我定制的亓那眼镜,猜猜定制了什么[doge]好奇?没关系,你们也可以拥有自己的定制眼镜。关注@QINA亓那眼镜 解锁6月限定惊喜,#时髦寻宝计划# 线上线下都安排了[偷笑]QINA亓那眼镜的微博视频  
赞[414547] 转发[1000000] 评论[1000000] 收藏 06月15日 10:09
#idoltube##周放vlog# 什么?放放子还有两副面孔呢?[喵喵] #幸福触手可及# Dear-迪丽热巴的微博视频  
赞[317796] 转发[1000000] 评论[514532] 收藏 06月14日 20:09 来自影视剪辑 · 视频社区
图片 原图 
赞[1150003] 转发[1000000] 评论[1000000] 收藏 06月12日 19:11 来自绿洲APP
言出必行,说了18张就是18张,送给七千万的你们 ~  [组图共18张]
放放子缺个快板[偷笑] 绿洲
图片 原图 
赞[571684] 转发[1000000] 评论[1000000] 收藏 06月08日 15:17 来自绿洲APP
转发了 WCS野生生物保护学会V 的微博:去年和亲善大使热巴@Dear-迪丽热巴 的特别回忆[心]。我们在藏北羌塘一起爬山,探访藏羚羊、雪豹、黑颈鹤的栖息地,感受野生动物保护工作的点滴。此时此刻,我们比以往更加重视与自然相处的方式,我们也从未如此迫切需要将想法付诸行动。热巴已经和我们@北京绿色阳光 站在一起,希望看完视频的你们,也...全文 赞[119295] 原文转发[1000000] 原文评论[38687]
转发理由:在羌塘的美好回忆~第一次来到这片独特的荒野,看到野生动物自由生活,还有一群快乐可爱的人在守护着它们。把这些美好留存下来,关注野生动物保护,积极行动,我们每个人都能贡献力量。[心]  
赞[554383] 转发[1000000] 评论[1000000] 收藏 06月05日 11:11
要开心。要充实。
#微博live秀# 28岁的直播~@Dear-迪丽热巴 的一直播(下载App->http://t.cn/RDUuslr 
赞[435625] 转发[23583] 评论[1000000] 收藏 06月03日 19:00 来自一直播Yi
下页  1/117页
TOP
彩版|触屏|语音
weibo.cn[06-18 23:37]
\ No newline at end of file +Dear-迪丽热巴的微博
头像
Dear-迪丽热巴VM 女/上海   加关注
认证:嘉行传媒签约演员 
一只喜欢默默表演的小透明。工作联系jaywalk@jaywalk.com.cn ...
私信 资料 操作 特别关注 送Ta会员
 微博  相册 
#一起热爱就现在#给你们康康我眼前的画面[嘻嘻] 绿洲
图片 原图 
赞[523801] 转发[1000000] 评论[393530] 收藏 06月17日 18:12 来自绿洲APP
刚收到我定制的亓那眼镜,猜猜定制了什么[doge]好奇?没关系,你们也可以拥有自己的定制眼镜。关注@QINA亓那眼镜 解锁6月限定惊喜,#时髦寻宝计划# 线上线下都安排了[偷笑]QINA亓那眼镜的微博视频  
赞[415899] 转发[1000000] 评论[1000000] 收藏 06月15日 10:09
#idoltube##周放vlog# 什么?放放子还有两副面孔呢?[喵喵] #幸福触手可及# Dear-迪丽热巴的微博视频  
赞[318054] 转发[1000000] 评论[514546] 收藏 06月14日 20:09 来自影视剪辑 · 视频社区
图片 原图 
赞[1150265] 转发[1000000] 评论[1000000] 收藏 06月12日 19:11 来自绿洲APP
言出必行,说了18张就是18张,送给七千万的你们 ~  [组图共18张]
放放子缺个快板[偷笑] 绿洲
图片 原图 
赞[571755] 转发[1000000] 评论[1000000] 收藏 06月08日 15:17 来自绿洲APP
转发了 WCS野生生物保护学会V 的微博:去年和亲善大使热巴@Dear-迪丽热巴 的特别回忆[心]。我们在藏北羌塘一起爬山,探访藏羚羊、雪豹、黑颈鹤的栖息地,感受野生动物保护工作的点滴。此时此刻,我们比以往更加重视与自然相处的方式,我们也从未如此迫切需要将想法付诸行动。热巴已经和我们@北京绿色阳光 站在一起,希望看完视频的你们,也...全文 赞[119296] 原文转发[1000000] 原文评论[38688]
转发理由:在羌塘的美好回忆~第一次来到这片独特的荒野,看到野生动物自由生活,还有一群快乐可爱的人在守护着它们。把这些美好留存下来,关注野生动物保护,积极行动,我们每个人都能贡献力量。[心]  
赞[554415] 转发[1000000] 评论[1000000] 收藏 06月05日 11:11
要开心。要充实。
#微博live秀# 28岁的直播~@Dear-迪丽热巴 的一直播(下载App->http://t.cn/RDUuslr 
赞[435650] 转发[23584] 评论[1000000] 收藏 06月03日 19:00 来自一直播Yi
下页  1/117页
TOP
彩版|触屏|语音
weibo.cn[06-19 00:47]
\ No newline at end of file diff --git a/tests/testdata/4d5ed0a3ebd0303cb45edd544dbc0ab5e86d43e103405f0c60515884.html b/tests/testdata/4d5ed0a3ebd0303cb45edd544dbc0ab5e86d43e103405f0c60515884.html new file mode 100644 index 00000000..9cb503c5 --- /dev/null +++ b/tests/testdata/4d5ed0a3ebd0303cb45edd544dbc0ab5e86d43e103405f0c60515884.html @@ -0,0 +1 @@ +评论列表
Dear-迪丽热巴VM  转发了 @WCS野生生物保护学会V 的微博:去年和亲善大使热巴@Dear-迪丽热巴 的特别回忆[心]。我们在藏北羌塘一起爬山,探访藏羚羊、雪豹、黑颈鹤的栖息地,感受野生动物保护工作的点滴。此时此刻,我们比以往更加重视与自然相处的方式,我们也从未如此迫切需要将想法付诸行动。热巴已经和我们@北京绿色阳光 站在一起,希望看完视频的你们,也能获得同样感受与动力。

We Stand for Wildlife.

明日朝阳68309的优酷视频  原文转发[1000000]  原文评论[38688]
转发理由: 在羌塘的美好回忆~第一次来到这片独特的荒野,看到野生动物自由生活,还有一群快乐可爱的人在守护着它们。把这些美好留存下来,关注野生动物保护,积极行动,我们每个人都能贡献力量。[心]   06月05日 11:11  关注她  举报 收藏 操作
 转发[1000000]  评论[1000000]  赞[554415] 
评论只显示前140字:

 
[热门]LUX力士VM:大家和迪迪一起保护动物[心] 举报 赞[18506] 回复 06月05日 11:12 来自网页
[热门]护舒宝VM:跟迪迪一起好好保护野生动物[心] 举报 赞[17193] 回复 06月05日 11:13 来自网页
[热门]Dear迪丽热巴后援会VM:一起保护野生动物[给你小心心] 举报 赞[15760] 回复 06月05日 11:12 来自网页
心动小巴 :姐姐你是我的榜样~  举报   赞[0]  回复   42分钟前 来自网页
给肖战热巴摘星星 :跟迪迪一起好好保护野生动物[心]  举报   赞[0]  回复   06月18日 23:22 来自网页
唯独爱你0603 :♥️♥️♥️  举报   赞[0]  回复   06月17日 23:08 来自网页
山野千里57383 :迪丽热巴  举报   赞[0]  回复   06月17日 22:19 来自网页
罗兰小幸福-1988 :嗯热巴姐姐说的对  举报   赞[1]  回复   06月17日 22:02 来自网页
ColumbiaYemenRussia :热巴,加油哦  举报   赞[0]  回复   06月17日 17:11 来自网页
江南很难 M :离谱  举报   赞[0]  回复   06月17日 14:56 来自网页
下页  1/100000页
\ No newline at end of file diff --git a/tests/testdata/76233b3f90394581aac6f19cfa5d674a610e8b442b1f83de7673ab49.html b/tests/testdata/76233b3f90394581aac6f19cfa5d674a610e8b442b1f83de7673ab49.html index 9e6330f9..082bdba1 100644 --- a/tests/testdata/76233b3f90394581aac6f19cfa5d674a610e8b442b1f83de7673ab49.html +++ b/tests/testdata/76233b3f90394581aac6f19cfa5d674a610e8b442b1f83de7673ab49.html @@ -1 +1 @@ -微博
图片加载中... 1/2 原图
图片加载中... 2/2 原图
\ No newline at end of file +微博
图片加载中... 1/2 原图
图片加载中... 2/2 原图
\ No newline at end of file diff --git a/tests/testdata/a4437630f3bdfa2757bae1595186ac063fe5ec25cf2f98116ece83cb.html b/tests/testdata/a4437630f3bdfa2757bae1595186ac063fe5ec25cf2f98116ece83cb.html index 280d824a..ef861589 100644 --- a/tests/testdata/a4437630f3bdfa2757bae1595186ac063fe5ec25cf2f98116ece83cb.html +++ b/tests/testdata/a4437630f3bdfa2757bae1595186ac063fe5ec25cf2f98116ece83cb.html @@ -1 +1 @@ -Dear-迪丽热巴的微博
头像
Dear-迪丽热巴VM 女/上海   加关注
认证:嘉行传媒签约演员 
一只喜欢默默表演的小透明。工作联系jaywalk@jaywalk.com.cn ...
私信 资料 操作 特别关注 送Ta会员
 微博  相册 
#一起热爱就现在#给你们康康我眼前的画面[嘻嘻] 绿洲
图片 原图 
赞[508798] 转发[1000000] 评论[391666] 收藏 06月17日 18:12 来自绿洲APP
刚收到我定制的亓那眼镜,猜猜定制了什么[doge]好奇?没关系,你们也可以拥有自己的定制眼镜。关注@QINA亓那眼镜 解锁6月限定惊喜,#时髦寻宝计划# 线上线下都安排了[偷笑]QINA亓那眼镜的微博视频  
赞[414547] 转发[1000000] 评论[1000000] 收藏 06月15日 10:09
#idoltube##周放vlog# 什么?放放子还有两副面孔呢?[喵喵] #幸福触手可及# Dear-迪丽热巴的微博视频  
赞[317796] 转发[1000000] 评论[514532] 收藏 06月14日 20:09 来自影视剪辑 · 视频社区
图片 原图 
赞[1150003] 转发[1000000] 评论[1000000] 收藏 06月12日 19:11 来自绿洲APP
言出必行,说了18张就是18张,送给七千万的你们 ~  [组图共18张]
放放子缺个快板[偷笑] 绿洲
图片 原图 
赞[571684] 转发[1000000] 评论[1000000] 收藏 06月08日 15:17 来自绿洲APP
转发了 WCS野生生物保护学会V 的微博:去年和亲善大使热巴@Dear-迪丽热巴 的特别回忆[心]。我们在藏北羌塘一起爬山,探访藏羚羊、雪豹、黑颈鹤的栖息地,感受野生动物保护工作的点滴。此时此刻,我们比以往更加重视与自然相处的方式,我们也从未如此迫切需要将想法付诸行动。热巴已经和我们@北京绿色阳光 站在一起,希望看完视频的你们,也...全文 赞[119295] 原文转发[1000000] 原文评论[38687]
转发理由:在羌塘的美好回忆~第一次来到这片独特的荒野,看到野生动物自由生活,还有一群快乐可爱的人在守护着它们。把这些美好留存下来,关注野生动物保护,积极行动,我们每个人都能贡献力量。[心]  
赞[554383] 转发[1000000] 评论[1000000] 收藏 06月05日 11:11
要开心。要充实。
#微博live秀# 28岁的直播~@Dear-迪丽热巴 的一直播(下载App->http://t.cn/RDUuslr 
赞[435625] 转发[23583] 评论[1000000] 收藏 06月03日 19:00 来自一直播Yi
下页  1/117页
TOP
彩版|触屏|语音
weibo.cn[06-18 23:37]
\ No newline at end of file +Dear-迪丽热巴的微博
头像
Dear-迪丽热巴VM 女/上海   加关注
认证:嘉行传媒签约演员 
一只喜欢默默表演的小透明。工作联系jaywalk@jaywalk.com.cn ...
私信 资料 操作 特别关注 送Ta会员
 微博  相册 
#一起热爱就现在#给你们康康我眼前的画面[嘻嘻] 绿洲
图片 原图 
赞[523801] 转发[1000000] 评论[393530] 收藏 06月17日 18:12 来自绿洲APP
刚收到我定制的亓那眼镜,猜猜定制了什么[doge]好奇?没关系,你们也可以拥有自己的定制眼镜。关注@QINA亓那眼镜 解锁6月限定惊喜,#时髦寻宝计划# 线上线下都安排了[偷笑]QINA亓那眼镜的微博视频  
赞[415899] 转发[1000000] 评论[1000000] 收藏 06月15日 10:09
#idoltube##周放vlog# 什么?放放子还有两副面孔呢?[喵喵] #幸福触手可及# Dear-迪丽热巴的微博视频  
赞[318054] 转发[1000000] 评论[514546] 收藏 06月14日 20:09 来自影视剪辑 · 视频社区
图片 原图 
赞[1150265] 转发[1000000] 评论[1000000] 收藏 06月12日 19:11 来自绿洲APP
言出必行,说了18张就是18张,送给七千万的你们 ~  [组图共18张]
放放子缺个快板[偷笑] 绿洲
图片 原图 
赞[571755] 转发[1000000] 评论[1000000] 收藏 06月08日 15:17 来自绿洲APP
转发了 WCS野生生物保护学会V 的微博:去年和亲善大使热巴@Dear-迪丽热巴 的特别回忆[心]。我们在藏北羌塘一起爬山,探访藏羚羊、雪豹、黑颈鹤的栖息地,感受野生动物保护工作的点滴。此时此刻,我们比以往更加重视与自然相处的方式,我们也从未如此迫切需要将想法付诸行动。热巴已经和我们@北京绿色阳光 站在一起,希望看完视频的你们,也...全文 赞[119296] 原文转发[1000000] 原文评论[38688]
转发理由:在羌塘的美好回忆~第一次来到这片独特的荒野,看到野生动物自由生活,还有一群快乐可爱的人在守护着它们。把这些美好留存下来,关注野生动物保护,积极行动,我们每个人都能贡献力量。[心]  
赞[554415] 转发[1000000] 评论[1000000] 收藏 06月05日 11:11
要开心。要充实。
#微博live秀# 28岁的直播~@Dear-迪丽热巴 的一直播(下载App->http://t.cn/RDUuslr 
赞[435650] 转发[23584] 评论[1000000] 收藏 06月03日 19:00 来自一直播Yi
下页  1/117页
TOP
彩版|触屏|语音
weibo.cn[06-19 00:47]
\ No newline at end of file diff --git a/tests/testdata/ca5f2a555e8d62f728c66fa90afb2d54d19f8c898e164204a61bdf03.html b/tests/testdata/ca5f2a555e8d62f728c66fa90afb2d54d19f8c898e164204a61bdf03.html index 2990dbe4..10eab1d7 100644 --- a/tests/testdata/ca5f2a555e8d62f728c66fa90afb2d54d19f8c898e164204a61bdf03.html +++ b/tests/testdata/ca5f2a555e8d62f728c66fa90afb2d54d19f8c898e164204a61bdf03.html @@ -1 +1 @@ -Dear-迪丽热巴的资料
头像
会员等级:7级 送Ta会员
微身份 语惊四座 七步成诗 谈笑风生 更多勋章
基本信息
昵称:Dear-迪丽热巴
认证:嘉行传媒签约演员 
性别:女
地区:上海
生日:双子座
认证信息:嘉行传媒签约演员 
简介:一只喜欢默默表演的小透明。工作联系jaywalk@jaywalk.com.cn 🍒
学习经历
·上海戏剧学院
工作经历
·嘉行传媒 
其他信息
互联网:http://weibo.com/u/1669879400
手机版:https://weibo.cn/u/1669879400
她的相册>>
TOP
彩版|触屏|语音
weibo.cn[06-18 23:37]
\ No newline at end of file +Dear-迪丽热巴的资料
头像
会员等级:7级 送Ta会员
微身份 语惊四座 七步成诗 谈笑风生 更多勋章
基本信息
昵称:Dear-迪丽热巴
认证:嘉行传媒签约演员 
性别:女
地区:上海
生日:双子座
认证信息:嘉行传媒签约演员 
简介:一只喜欢默默表演的小透明。工作联系jaywalk@jaywalk.com.cn 🍒
学习经历
·上海戏剧学院
工作经历
·嘉行传媒 
其他信息
互联网:http://weibo.com/u/1669879400
手机版:https://weibo.cn/u/1669879400
她的相册>>
TOP
彩版|触屏|语音
weibo.cn[06-19 00:47]
\ No newline at end of file diff --git a/tests/testdata/d486235d4a17dd0accb0f2cc77b3648abfa03580b9e0cdb61f1e618f.html b/tests/testdata/d486235d4a17dd0accb0f2cc77b3648abfa03580b9e0cdb61f1e618f.html index bfb31fe9..7bac1628 100644 --- a/tests/testdata/d486235d4a17dd0accb0f2cc77b3648abfa03580b9e0cdb61f1e618f.html +++ b/tests/testdata/d486235d4a17dd0accb0f2cc77b3648abfa03580b9e0cdb61f1e618f.html @@ -1 +1 @@ -Dear-迪丽热巴的微博
Dear-迪丽热巴的微博 加关注
 微博  相册 
粉色天空、闪耀夜色、浪漫爱意…我把我喜爱的元素和巴黎限定记忆全部定格在这一瓶#YSL反转巴黎#热爱限定中,第一次与YSL一起合作设计香水,在这#拦不住的夏天#把甜甜的曼陀罗花香送给你们,喜欢吗?💓  [组图共2张]
#幸福触手可及开播##幸福触手可及# 度量自身,方能修炼精彩人生。追梦不易,披荆斩棘。今晚八点@湖南卫视 和周放,一起守护梦想,书写初夏。
很高兴成为力士大中华区沐浴系列代言人,520就要到啦,大家快来接收告白福利哦!全新植萃泡泡沐浴露让每一位小仙女都能在浓密泡泡浴中拥有夏日嫩白肌,仙气香气都十足!关注@LUX力士 第一时间锁定新品哦!LUX力士的微博视频  
赞[741400] 转发[1000000] 评论[1000000] 收藏 05月19日 09:03
转发了 电视剧幸福触手可及VM 的微博:#幸福触手可及##幸福触手可及定档0519# 从没有一个时刻,幸福如此靠近,只因有你在身边[心]5月19日20:00锁定@湖南卫视 金鹰独播剧场,@优酷 @爱奇艺 @腾讯视频 24点同步更新,等你解锁初夏甜梦!
图片 原图 赞[129675] 原文转发[332651] 原文评论[6900]
转发理由:#幸福触手可及定档0519# 唯有热爱,不负韶华,为之全力以赴,才能成为更优秀的人。5月19日20:00锁定湖南卫视#幸福触手可及# ,愈挫愈勇的独立设计师周放来啦。  
赞[480417] 转发[57012] 评论[40966] 收藏 05月15日 20:23
哈哈哈哈哈哈👅
转发了 北京2022年冬奥会VM 的微博:【爱豆喊你来助力#北京2022#
花样滑冰,旋转跳跃 ,“迪丽”前行 @Dear-迪丽热巴 北京2022年冬奥会的微博视频
 赞[680201] 原文转发[1450782] 原文评论[50694]
转发理由:与我一起,关注花样滑冰,为中国健儿鼓劲加油[加油]  
赞[501777] 转发[1000000] 评论[1000000] 收藏 05月15日 10:20
转发了 央视网VM 的微博:【想看看战疫一线医护人员们的脸!#极限挑战致敬医护人员#】脱下防疫服,援鄂人员们原来是这个模样。八位医护人员集体分享支援一线的故事,是他们为后方的我们竖起了最坚实的屏障,感谢这群医护天使的负重前行,致敬!@央视网青年 @雷佳音 @岳云鹏 @演员王迅 @贾乃亮 @努力努力再努力x @Dear-迪丽热巴...全文 赞[364004] 原文转发[1056354] 原文评论[3645]
转发理由:#极限挑战# 感谢你们的守护,最美的逆行者们[心]  
赞[571255] 转发[1000000] 评论[362127] 收藏 05月10日 23:01
#极限挑战# 无奖填词竞答,今晚看👉登峰造_,不可_量,百里_一,南征北_~ Dear-迪丽热巴的微博视频  
赞[731517] 转发[1000000] 评论[1000000] 收藏 05月10日 16:50
转发了 中国青年报VM 的微博:#五四致敬战疫青年# #青春万岁#各地应急响应级别陆续下调,我们正在走向痊愈。回望这些年轻医务人员的脸,不应忘记,正是他们在危难之下,白衣执甲,毅然逆行,为我们筑起血肉长城。感恩提灯天使,致敬最可爱的人!春暖花开,等到疫情完全解除,无论你是从医还是就医,请记住医患之间的休戚与共、唇齿...全文 [组图共12张]
图片 原图 赞[32125] 原文转发[4801627] 原文评论[6975]
转发理由:#五四致敬战疫青年#五四青年节前夕,让我们说一声,#谢谢你保护了我们#!  
赞[721484] 转发[1000000] 评论[597487] 收藏 05月02日 16:40
转发了 东方卫视极限挑战VM 的微博:鸡条君目睹了vivo#极限挑战#第六季首发阵容@雷佳音 @岳云鹏 @演员王迅 @贾乃亮 @努力努力再努力x @Dear-迪丽热巴 @郭京飞 @邓伦 集结的整个过程,这就是欢迎新人的方式[疑问]说好要相亲相爱的呢😂东方卫视极限挑战的微博视频  赞[711006] 原文转发[1409504] 原文评论[14761]
转发理由:#极限挑战#举手之劳,岳岳哥别客气!//@岳云鹏:#极限挑战#谢谢热巴@Dear-迪丽热巴 给我p图,我这里还有好多库存 查看图片  
赞[983375] 转发[1000000] 评论[1000000] 收藏 04月30日 12:30
下页 上页 首页  3/117页
TOP
彩版|触屏|语音
weibo.cn[06-18 23:38]
\ No newline at end of file +Dear-迪丽热巴的微博
Dear-迪丽热巴的微博 加关注
 微博  相册 
粉色天空、闪耀夜色、浪漫爱意…我把我喜爱的元素和巴黎限定记忆全部定格在这一瓶#YSL反转巴黎#热爱限定中,第一次与YSL一起合作设计香水,在这#拦不住的夏天#把甜甜的曼陀罗花香送给你们,喜欢吗?💓  [组图共2张]
#幸福触手可及开播##幸福触手可及# 度量自身,方能修炼精彩人生。追梦不易,披荆斩棘。今晚八点@湖南卫视 和周放,一起守护梦想,书写初夏。
很高兴成为力士大中华区沐浴系列代言人,520就要到啦,大家快来接收告白福利哦!全新植萃泡泡沐浴露让每一位小仙女都能在浓密泡泡浴中拥有夏日嫩白肌,仙气香气都十足!关注@LUX力士 第一时间锁定新品哦!LUX力士的微博视频  
赞[741402] 转发[1000000] 评论[1000000] 收藏 05月19日 09:03
转发了 电视剧幸福触手可及VM 的微博:#幸福触手可及##幸福触手可及定档0519# 从没有一个时刻,幸福如此靠近,只因有你在身边[心]5月19日20:00锁定@湖南卫视 金鹰独播剧场,@优酷 @爱奇艺 @腾讯视频 24点同步更新,等你解锁初夏甜梦!
图片 原图 赞[129675] 原文转发[332651] 原文评论[6900]
转发理由:#幸福触手可及定档0519# 唯有热爱,不负韶华,为之全力以赴,才能成为更优秀的人。5月19日20:00锁定湖南卫视#幸福触手可及# ,愈挫愈勇的独立设计师周放来啦。  
赞[480418] 转发[57012] 评论[40966] 收藏 05月15日 20:23
哈哈哈哈哈哈👅
转发了 北京2022年冬奥会VM 的微博:【爱豆喊你来助力#北京2022#
花样滑冰,旋转跳跃 ,“迪丽”前行 @Dear-迪丽热巴 北京2022年冬奥会的微博视频
 赞[680201] 原文转发[1450782] 原文评论[50694]
转发理由:与我一起,关注花样滑冰,为中国健儿鼓劲加油[加油]  
赞[501777] 转发[1000000] 评论[1000000] 收藏 05月15日 10:20
转发了 央视网VM 的微博:【想看看战疫一线医护人员们的脸!#极限挑战致敬医护人员#】脱下防疫服,援鄂人员们原来是这个模样。八位医护人员集体分享支援一线的故事,是他们为后方的我们竖起了最坚实的屏障,感谢这群医护天使的负重前行,致敬!@央视网青年 @雷佳音 @岳云鹏 @演员王迅 @贾乃亮 @努力努力再努力x @Dear-迪丽热巴...全文 赞[364004] 原文转发[1056354] 原文评论[3645]
转发理由:#极限挑战# 感谢你们的守护,最美的逆行者们[心]  
赞[571256] 转发[1000000] 评论[362127] 收藏 05月10日 23:01
#极限挑战# 无奖填词竞答,今晚看👉登峰造_,不可_量,百里_一,南征北_~ Dear-迪丽热巴的微博视频  
赞[731516] 转发[1000000] 评论[1000000] 收藏 05月10日 16:50
转发了 中国青年报VM 的微博:#五四致敬战疫青年# #青春万岁#各地应急响应级别陆续下调,我们正在走向痊愈。回望这些年轻医务人员的脸,不应忘记,正是他们在危难之下,白衣执甲,毅然逆行,为我们筑起血肉长城。感恩提灯天使,致敬最可爱的人!春暖花开,等到疫情完全解除,无论你是从医还是就医,请记住医患之间的休戚与共、唇齿...全文 [组图共12张]
图片 原图 赞[32125] 原文转发[4801631] 原文评论[6975]
转发理由:#五四致敬战疫青年#五四青年节前夕,让我们说一声,#谢谢你保护了我们#!  
赞[721484] 转发[1000000] 评论[597487] 收藏 05月02日 16:40
转发了 东方卫视极限挑战VM 的微博:鸡条君目睹了vivo#极限挑战#第六季首发阵容@雷佳音 @岳云鹏 @演员王迅 @贾乃亮 @努力努力再努力x @Dear-迪丽热巴 @郭京飞 @邓伦 集结的整个过程,这就是欢迎新人的方式[疑问]说好要相亲相爱的呢😂东方卫视极限挑战的微博视频  赞[711013] 原文转发[1409505] 原文评论[14761]
转发理由:#极限挑战#举手之劳,岳岳哥别客气!//@岳云鹏:#极限挑战#谢谢热巴@Dear-迪丽热巴 给我p图,我这里还有好多库存 查看图片  
赞[983376] 转发[1000000] 评论[1000000] 收藏 04月30日 12:30
下页 上页 首页  3/117页
TOP
彩版|触屏|语音
weibo.cn[06-19 00:47]
\ No newline at end of file diff --git a/tests/testdata/e97222acd5bc7d8d1bfbd3f352f8cad3e36fdd19e40b69e1c33fb3c3.html b/tests/testdata/e97222acd5bc7d8d1bfbd3f352f8cad3e36fdd19e40b69e1c33fb3c3.html index e6ef95e4..7ca2ef2c 100644 --- a/tests/testdata/e97222acd5bc7d8d1bfbd3f352f8cad3e36fdd19e40b69e1c33fb3c3.html +++ b/tests/testdata/e97222acd5bc7d8d1bfbd3f352f8cad3e36fdd19e40b69e1c33fb3c3.html @@ -1 +1 @@ -微博
图片加载中... 1/2 原图
图片加载中... 2/2 原图
\ No newline at end of file +微博
图片加载中... 1/2 原图
图片加载中... 2/2 原图
\ No newline at end of file diff --git a/tests/testdata/url_map.json b/tests/testdata/url_map.json index b51dd445..f762bcb1 100644 --- a/tests/testdata/url_map.json +++ b/tests/testdata/url_map.json @@ -6,5 +6,6 @@ "https://weibo.cn/mblog/picAll/J5ZcSnCAg?rl=1": "tests/testdata/63a98849ec82b2c87ec55bca03cbf5988f7eac233a23d86b4fdf5ffd.html", "https://weibo.cn/1669879400?page=2": "tests/testdata/2f62165fa3ca1e85e0d398d385c377a068b76eb95765f7020ffffd3e.html", "https://weibo.cn/1669879400?page=3": "tests/testdata/d486235d4a17dd0accb0f2cc77b3648abfa03580b9e0cdb61f1e618f.html", - "https://weibo.cn/mblog/picAll/J3xfm61AZ?rl=1": "tests/testdata/76233b3f90394581aac6f19cfa5d674a610e8b442b1f83de7673ab49.html" + "https://weibo.cn/mblog/picAll/J3xfm61AZ?rl=1": "tests/testdata/76233b3f90394581aac6f19cfa5d674a610e8b442b1f83de7673ab49.html", + "https://weibo.cn/comment/J5cVGuUNq": "tests/testdata/4d5ed0a3ebd0303cb45edd544dbc0ab5e86d43e103405f0c60515884.html" } \ No newline at end of file diff --git a/tests/util.py b/tests/util.py new file mode 100644 index 00000000..19292ecc --- /dev/null +++ b/tests/util.py @@ -0,0 +1,15 @@ +from unittest.mock import Mock +import json +import os + +from weibo_spider.parser.util import TEST_DATA_DIR, URL_MAP_FILE + + +def mock_request_get_content(url, cookies): + with open(os.path.join(TEST_DATA_DIR, URL_MAP_FILE)) as f: + url_map = json.loads(f.read()) + resp_file = url_map[url] + mock = Mock() + with open(resp_file, "rb") as f: + mock.content = f.read() + return mock From 8d7378895b9d1cc36268035229216142fb0834b2 Mon Sep 17 00:00:00 2001 From: Zhengyang Song Date: Fri, 19 Jun 2020 01:02:14 +0800 Subject: [PATCH 219/363] tests[#168]: add test for index_parser. 
--- tests/test_index_parser.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 tests/test_index_parser.py diff --git a/tests/test_index_parser.py b/tests/test_index_parser.py new file mode 100644 index 00000000..65c70479 --- /dev/null +++ b/tests/test_index_parser.py @@ -0,0 +1,15 @@ +from unittest.mock import patch + +from .util import mock_request_get_content +from weibo_spider.parser.index_parser import IndexParser + + +@patch('requests.get', mock_request_get_content) +def test_page_parser(): + index_parser = IndexParser(cookie="", user_uri="1669879400") + assert (index_parser.get_page_num() == 117) + assert (str(index_parser.get_user()) == """用户昵称: Dear-迪丽热巴\n""" + """用户id: 1669879400\n""" + """微博数: 1159\n""" + """关注数: 253\n""" + """粉丝数: 70805574\n""") From b61748363631cc08b8f870b91640b125495cbf00 Mon Sep 17 00:00:00 2001 From: Zhengyang Song Date: Fri, 19 Jun 2020 01:08:19 +0800 Subject: [PATCH 220/363] tests[#168]: add test for info_parser --- tests/test_info_parser.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100644 tests/test_info_parser.py diff --git a/tests/test_info_parser.py b/tests/test_info_parser.py new file mode 100644 index 00000000..2de748d2 --- /dev/null +++ b/tests/test_info_parser.py @@ -0,0 +1,12 @@ +from unittest.mock import patch + +from .util import mock_request_get_content +from weibo_spider.parser.info_parser import InfoParser + + +@patch('requests.get', mock_request_get_content) +def test_page_parser(): + info_parser = InfoParser(cookie="", user_id="1669879400") + user = info_parser.extract_user_info() + # With info_parser, we can only get the nickname. 
+ assert (user.nickname == "Dear-迪丽热巴") From 4a042ec4f70a1ae867f5d358cc161b31268a5412 Mon Sep 17 00:00:00 2001 From: Zhengyang Song Date: Fri, 19 Jun 2020 01:10:58 +0800 Subject: [PATCH 221/363] tests[#168]: add test for mblog_picAll_parser --- tests/test_mblog_picAll_parser.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 tests/test_mblog_picAll_parser.py diff --git a/tests/test_mblog_picAll_parser.py b/tests/test_mblog_picAll_parser.py new file mode 100644 index 00000000..770a04e7 --- /dev/null +++ b/tests/test_mblog_picAll_parser.py @@ -0,0 +1,15 @@ +from unittest.mock import patch + +from .util import mock_request_get_content +from weibo_spider.parser.mblog_picAll_parser import MblogPicAllParser + + +@patch('requests.get', mock_request_get_content) +def test_page_parser(): + mblog_picAll_parser = MblogPicAllParser(cookie="", weibo_id="J5ZcSnCAg") + preview_picture_list = mblog_picAll_parser.extract_preview_picture_list() + # With info_parser, we can only get the nickname. 
+ assert (len(preview_picture_list) == 18) + assert ( + preview_picture_list[0] == + 'http://ww3.sinaimg.cn/thumb180/63885668ly1gfn5qz5m1yj20u0140472.jpg') From bb32883ff81e32785cfc91743add77f73474381e Mon Sep 17 00:00:00 2001 From: Zhengyang Song Date: Fri, 19 Jun 2020 01:15:30 +0800 Subject: [PATCH 222/363] tests[#168]: move the parser tests to folder tests/test_parser/ --- tests/test_parser/__init__.py | 0 tests/{ => test_parser}/test_comment_parser.py | 0 tests/{ => test_parser}/test_index_parser.py | 0 tests/{ => test_parser}/test_info_parser.py | 0 tests/{ => test_parser}/test_mblog_picAll_parser.py | 0 tests/{ => test_parser}/test_page_parser.py | 0 tests/{ => test_parser}/util.py | 0 7 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 tests/test_parser/__init__.py rename tests/{ => test_parser}/test_comment_parser.py (100%) rename tests/{ => test_parser}/test_index_parser.py (100%) rename tests/{ => test_parser}/test_info_parser.py (100%) rename tests/{ => test_parser}/test_mblog_picAll_parser.py (100%) rename tests/{ => test_parser}/test_page_parser.py (100%) rename tests/{ => test_parser}/util.py (100%) diff --git a/tests/test_parser/__init__.py b/tests/test_parser/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/test_comment_parser.py b/tests/test_parser/test_comment_parser.py similarity index 100% rename from tests/test_comment_parser.py rename to tests/test_parser/test_comment_parser.py diff --git a/tests/test_index_parser.py b/tests/test_parser/test_index_parser.py similarity index 100% rename from tests/test_index_parser.py rename to tests/test_parser/test_index_parser.py diff --git a/tests/test_info_parser.py b/tests/test_parser/test_info_parser.py similarity index 100% rename from tests/test_info_parser.py rename to tests/test_parser/test_info_parser.py diff --git a/tests/test_mblog_picAll_parser.py b/tests/test_parser/test_mblog_picAll_parser.py similarity index 100% rename from 
tests/test_mblog_picAll_parser.py rename to tests/test_parser/test_mblog_picAll_parser.py diff --git a/tests/test_page_parser.py b/tests/test_parser/test_page_parser.py similarity index 100% rename from tests/test_page_parser.py rename to tests/test_parser/test_page_parser.py diff --git a/tests/util.py b/tests/test_parser/util.py similarity index 100% rename from tests/util.py rename to tests/test_parser/util.py From 1b219a901707801ae030976e550ac6c81861c3df Mon Sep 17 00:00:00 2001 From: Zhengyang Song Date: Fri, 19 Jun 2020 01:23:23 +0800 Subject: [PATCH 223/363] Create python-app.yml --- .github/workflows/python-app.yml | 36 ++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 .github/workflows/python-app.yml diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml new file mode 100644 index 00000000..c7f50674 --- /dev/null +++ b/.github/workflows/python-app.yml @@ -0,0 +1,36 @@ +# This workflow will install Python dependencies, run tests and lint with a single version of Python +# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions + +name: Python application + +on: + push: + branches: [ master ] + pull_request: + branches: [ master ] + +jobs: + build: + + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v2 + - name: Set up Python 3.8 + uses: actions/setup-python@v2 + with: + python-version: 3.8 + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install flake8 pytest + if [ -f requirements.txt ]; then pip install -r requirements.txt; fi + - name: Lint with flake8 + run: | + # stop the build if there are Python syntax errors or undefined names + flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics + # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide + flake8 . 
--count --exit-zero --max-complexity=10 --max-line-length=127 --statistics + - name: Test with pytest + run: | + pytest From 45ad20351d97e4f6233aa44be48c684e2d1268d2 Mon Sep 17 00:00:00 2001 From: Zhengyang Song Date: Fri, 19 Jun 2020 01:29:46 +0800 Subject: [PATCH 224/363] Update README.md Add the badge for workflow "Python application". --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index adf75f1f..1145959a 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,5 @@ +![Build Status](https://github.com/dataabc/weiboSpider/workflows/Python%20application/badge.svg) + # Weibo Spider 本程序可以连续爬取**一个**或**多个**新浪微博用户(如[胡歌](https://weibo.cn/u/1223178222)、[迪丽热巴](https://weibo.cn/u/1669879400)、[郭碧婷](https://weibo.cn/u/1729370543))的数据,并将结果信息写入**文件**或**数据库**。写入信息几乎包括用户微博的所有数据,包括**用户信息**和**微博信息**两大类。因为内容太多,这里不再赘述,详细内容见[获取到的字段](#获取到的字段)。如果只需要用户信息,可以通过设置实现只爬取微博用户信息的功能。本程序需设置cookie来获取微博访问权限,后面会讲解[如何获取cookie](#如何获取cookie)。如果不想设置cookie,可以使用[免cookie版](https://github.com/dataabc/weibo-crawler),二者功能类似。 From 87b3189f221d676e3a98a926e6e060ea58d567b9 Mon Sep 17 00:00:00 2001 From: Zhengyang Song Date: Fri, 19 Jun 2020 02:14:39 +0800 Subject: [PATCH 225/363] fix the test function names --- tests/test_parser/test_index_parser.py | 2 +- tests/test_parser/test_info_parser.py | 2 +- tests/test_parser/test_mblog_picAll_parser.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_parser/test_index_parser.py b/tests/test_parser/test_index_parser.py index 65c70479..3c2ef26b 100644 --- a/tests/test_parser/test_index_parser.py +++ b/tests/test_parser/test_index_parser.py @@ -5,7 +5,7 @@ @patch('requests.get', mock_request_get_content) -def test_page_parser(): +def test_index_parser(): index_parser = IndexParser(cookie="", user_uri="1669879400") assert (index_parser.get_page_num() == 117) assert (str(index_parser.get_user()) == """用户昵称: Dear-迪丽热巴\n""" diff --git a/tests/test_parser/test_info_parser.py 
b/tests/test_parser/test_info_parser.py index 2de748d2..cc3aa9f3 100644 --- a/tests/test_parser/test_info_parser.py +++ b/tests/test_parser/test_info_parser.py @@ -5,7 +5,7 @@ @patch('requests.get', mock_request_get_content) -def test_page_parser(): +def test_info_parser(): info_parser = InfoParser(cookie="", user_id="1669879400") user = info_parser.extract_user_info() # With info_parser, we can only get the nickname. diff --git a/tests/test_parser/test_mblog_picAll_parser.py b/tests/test_parser/test_mblog_picAll_parser.py index 770a04e7..5dbdf530 100644 --- a/tests/test_parser/test_mblog_picAll_parser.py +++ b/tests/test_parser/test_mblog_picAll_parser.py @@ -5,7 +5,7 @@ @patch('requests.get', mock_request_get_content) -def test_page_parser(): +def test_mblog_picAll_parser(): mblog_picAll_parser = MblogPicAllParser(cookie="", weibo_id="J5ZcSnCAg") preview_picture_list = mblog_picAll_parser.extract_preview_picture_list() # With info_parser, we can only get the nickname. From ee981dbb235a230882dc379ae498876b8371a823 Mon Sep 17 00:00:00 2001 From: dataabc Date: Sat, 20 Jun 2020 14:12:00 +0800 Subject: [PATCH 226/363] =?UTF-8?q?fix:=20=E4=BF=AE=E5=A4=8D=E5=9B=A0user?= =?UTF-8?q?=5Fid=5Flist.txt=E6=96=87=E4=BB=B6=E4=B8=8D=E6=AD=A3=E7=A1=AE?= =?UTF-8?q?=E5=87=BA=E9=94=99=E7=9A=84bug?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- weibo_spider/config_util.py | 2 +- weibo_spider/weibo_spider.py | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/weibo_spider/config_util.py b/weibo_spider/config_util.py index 12f6f2b8..8eead2a4 100644 --- a/weibo_spider/config_util.py +++ b/weibo_spider/config_util.py @@ -61,7 +61,7 @@ def get_user_config_list(file_name, default_since_date): user_config_list = [] for line in lines: info = line.split(" ") - if len(info) > 0: + if len(info) > 0 and info[0].isdigit(): user_config = {} user_config["user_uri"] = info[0] if len(info) > 2 and _is_date(info[2]): diff --git 
a/weibo_spider/weibo_spider.py b/weibo_spider/weibo_spider.py index d944d4a4..25819c7b 100644 --- a/weibo_spider/weibo_spider.py +++ b/weibo_spider/weibo_spider.py @@ -189,6 +189,10 @@ def initialize_info(self, user_config): def start(self): """运行爬虫""" try: + if not self.user_config_list: + print( + u'没有配置有效的user_id,请通过config.json或user_id_list.txt配置user_id') + return for user_config in self.user_config_list: self.get_user_info(user_config["user_uri"]) print(self.user) From 0258250822e0aa8d87cea21ac0d28ee77c2bca60 Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Sun, 21 Jun 2020 17:08:41 +0800 Subject: [PATCH 227/363] Update README.md --- README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/README.md b/README.md index 1145959a..90eac211 100644 --- a/README.md +++ b/README.md @@ -36,6 +36,7 @@ - [定期自动爬取微博(可选)](#定期自动爬取微博可选) - [如何获取cookie](#如何获取cookie) - [如何获取user_id](#如何获取user_id) + - [常见问题](#常见问题) - [相关项目](#相关项目) - [注意事项](#注意事项) @@ -207,6 +208,10 @@ $ python3 -m weibo_spider --config_path="config.json" 要了解获取user_id方法,请查看[user_id文档](https://github.com/dataabc/weiboSpider/blob/master/docs/userid.md),该文档介绍了如何获取一个及多个微博用户user_id的方法。 +## 常见问题 + +如果运行程序的过程中出现错误,可以查看[常见问题](https://github.com/dataabc/weiboSpider/blob/master/docs/FAQ.md)页面,里面包含了最常见的问题及解决方法。如果出现的错误不在常见问题里,您可以通过[发issue](https://github.com/dataabc/weiboSpider/issues/new/choose)寻求帮助,我们会很乐意为您解答。 + ## 相关项目 - [weibo-crawler](https://github.com/dataabc/weibo-crawler) - 功能和本项目完全一样,可以不添加cookie,获取的微博属性更多; @@ -215,4 +220,5 @@ $ python3 -m weibo_spider --config_path="config.json" ## 注意事项 1.user_id不能为爬虫微博的user_id。因为要爬微博信息,必须先登录到某个微博账号,此账号我们姑且称为爬虫微博。爬虫微博访问自己的页面和访问其他用户的页面,得到的网页格式不同,所以无法爬取自己的微博信息;如果想要爬取爬虫微博内容,可以参考[获取自身微博信息](https://github.com/dataabc/weiboSpider/issues/113); + 2.cookie有期限限制,大约三个月。若提示cookie错误或已过期,需要重新更新cookie。 From 349a39985e519cd8120bfbaecfea254fda412ee2 Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Sun, 21 Jun 2020 17:14:55 +0800 Subject: [PATCH 228/363] Update failed.md --- 
.github/ISSUE_TEMPLATE/failed.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/ISSUE_TEMPLATE/failed.md b/.github/ISSUE_TEMPLATE/failed.md index 779d3fb7..142407d5 100644 --- a/.github/ISSUE_TEMPLATE/failed.md +++ b/.github/ISSUE_TEMPLATE/failed.md @@ -21,7 +21,7 @@ assignees: '' - 问:若只有爬特定微博时才出错,能否提供出错微博的weibo_id或url(非必填)?
答: -- 问:若您已提供出错微博的weibo_id或url,可忽略此内容,否则能否提供出错账号的**user_id**(非必填)?
+- 问:若您已提供出错微博的weibo_id或url,可忽略此内容,否则能否提供出错账号的**user_id**及您配置的**since_date**,方便我们定位出错微博(非必填)?
答: - 问:如果方便,请您描述出错详情,最好附上错误提示。
From d4584d6c2523f0c901eab1cd6fa927fbdcaf9e48 Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Sun, 21 Jun 2020 17:17:18 +0800 Subject: [PATCH 229/363] Update bug-report.md --- .github/ISSUE_TEMPLATE/bug-report.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/ISSUE_TEMPLATE/bug-report.md b/.github/ISSUE_TEMPLATE/bug-report.md index 1e8b9ee7..38d1e3d6 100644 --- a/.github/ISSUE_TEMPLATE/bug-report.md +++ b/.github/ISSUE_TEMPLATE/bug-report.md @@ -23,7 +23,7 @@ assignees: '' - 问:若只有爬特定微博时才出bug,能否提供出错微博的weibo_id或url(非必填)?
答: -- 问:若您已提供出错微博的weibo_id或url,可忽略此内容,否则能否提供出错账号的**user_id**(非必填)?
+- 问:若您已提供出错微博的weibo_id或url,可忽略此内容,否则能否提供出错账号的**user_id**及您配置的**since_date**,方便我们定位出错微博(非必填)?
答: - 问:如果方便,请您描述bug详情,如果代码报错,最好附上错误提示。
From 5eb8dc01e565fea2aead3b6a7316d73661417fc1 Mon Sep 17 00:00:00 2001 From: dataabc Date: Thu, 25 Jun 2020 18:30:27 +0800 Subject: [PATCH 230/363] =?UTF-8?q?fix:=20=E4=BF=AE=E5=A4=8D=E6=97=A0?= =?UTF-8?q?=E6=B3=95=E6=8C=89=E7=85=A7user=5Fid=5Flist.txt=E4=B8=BA?= =?UTF-8?q?=E5=A4=9A=E4=B8=AAid=E5=8D=95=E7=8B=AC=E8=AE=BE=E7=BD=AEsince?= =?UTF-8?q?=5Fdate=E7=9A=84=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- weibo_spider/weibo_spider.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/weibo_spider/weibo_spider.py b/weibo_spider/weibo_spider.py index 25819c7b..51ec548d 100644 --- a/weibo_spider/weibo_spider.py +++ b/weibo_spider/weibo_spider.py @@ -100,7 +100,7 @@ def get_weibo_info(self): weibos, self.weibo_id_list = PageParser( self.cookie, self.user_config["user_uri"], page, self.filter).get_one_page( - self.since_date, + self.user_config['since_date'], self.weibo_id_list) # 获取第page页的全部微博 print(u"{}已获取{}({})的第{}页微博{}".format( "-" * 30, From 5b9278f684dac047a43bccad36211de7c52b5409 Mon Sep 17 00:00:00 2001 From: dataabc Date: Thu, 25 Jun 2020 21:18:13 +0800 Subject: [PATCH 231/363] =?UTF-8?q?feat:=20=E6=B7=BB=E5=8A=A0=E6=8C=89?= =?UTF-8?q?=E6=97=B6=E9=97=B4=E6=AE=B5=E8=8E=B7=E5=8F=96=E5=BE=AE=E5=8D=9A?= =?UTF-8?q?=E7=9A=84=E5=8A=9F=E8=83=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Issue #172 --- tests/test_parser/test_page_parser.py | 12 +++++++++--- weibo_spider/config_sample.json | 3 ++- weibo_spider/parser/page_parser.py | 18 +++++++++++++----- weibo_spider/weibo_spider.py | 14 +++++++++----- 4 files changed, 33 insertions(+), 14 deletions(-) diff --git a/tests/test_parser/test_page_parser.py b/tests/test_parser/test_page_parser.py index a6b2b211..4eb637ab 100644 --- a/tests/test_parser/test_page_parser.py +++ b/tests/test_parser/test_page_parser.py @@ -1,16 +1,22 @@ from unittest.mock import patch -from .util import 
mock_request_get_content from weibo_spider.parser.page_parser import PageParser +from .util import mock_request_get_content + @patch('requests.get', mock_request_get_content) def test_page_parser(): + user_config = { + 'user_uri': '1669879400', + 'since_date': '2020-06-01', + 'end_date': 'now' + } page_parser = PageParser(cookie="", - user_uri="1669879400", + user_config=user_config, page=2, filter=True) - weibos, weibo_id_list = page_parser.get_one_page("2020-06-01", []) + weibos, weibo_id_list = page_parser.get_one_page([]) assert (weibo_id_list == ['J4PGk4yMw', 'J4EUStJKu']) assert (len(weibos) == 2) assert (str(weibos[0]) == """生日动态 \xa0\n""" diff --git a/weibo_spider/config_sample.json b/weibo_spider/config_sample.json index 04db0f39..b0325c28 100644 --- a/weibo_spider/config_sample.json +++ b/weibo_spider/config_sample.json @@ -2,6 +2,7 @@ "user_id_list": ["1669879400"], "filter": 1, "since_date": "2018-01-01", + "end_date": "now", "write_mode": ["csv", "txt"], "pic_download": 1, "video_download": 1, @@ -13,4 +14,4 @@ "password": "123456", "charset": "utf8mb4" } -} +} \ No newline at end of file diff --git a/weibo_spider/parser/page_parser.py b/weibo_spider/parser/page_parser.py index 326892f2..e18d4ea9 100644 --- a/weibo_spider/parser/page_parser.py +++ b/weibo_spider/parser/page_parser.py @@ -8,27 +8,35 @@ from .. 
import datetime_util from ..weibo import Weibo from .comment_parser import CommentParser -from .parser import Parser from .mblog_picAll_parser import MblogPicAllParser +from .parser import Parser from .util import handle_garbled, handle_html class PageParser(Parser): - def __init__(self, cookie, user_uri, page, filter): + def __init__(self, cookie, user_config, page, filter): self.cookie = cookie + self.user_uri = user_config['user_uri'] + self.since_date = user_config['since_date'] + self.end_date = user_config['end_date'] self.page = page - self.url = "https://weibo.cn/%s?page=%d" % (user_uri, page) + self.url = "https://weibo.cn/%s?page=%d" % (self.user_uri, page) + if self.end_date != 'now': + starttime = self.since_date.replace('-', '') + endtime = self.end_date.replace('-', '') + self.url = 'https://weibo.cn/%s/profile?starttime=%s&endtime=%s&advancedfilter=1&page=%d' % ( + self.user_uri, starttime, endtime, page) self.selector = handle_html(self.cookie, self.url) self.filter = filter - def get_one_page(self, since_date, weibo_id_list): + def get_one_page(self, weibo_id_list): """获取第page页的全部微博""" try: info = self.selector.xpath("//div[@class='c']") is_exist = info[0].xpath("div/span[@class='ctt']") weibos = [] if is_exist: - since_date = datetime_util.str_to_time(since_date) + since_date = datetime_util.str_to_time(self.since_date) for i in range(0, len(info) - 2): weibo = self.get_one_weibo(info[i]) if weibo: diff --git a/weibo_spider/weibo_spider.py b/weibo_spider/weibo_spider.py index 51ec548d..e31262c1 100644 --- a/weibo_spider/weibo_spider.py +++ b/weibo_spider/weibo_spider.py @@ -32,7 +32,9 @@ def __init__(self, config): since_date = str(config["since_date"]) if since_date.isdigit(): since_date = str(date.today() - timedelta(int(since_date))) - self.since_date = since_date # 起始时间,即爬取发布日期从该值到现在的微博,形式为yyyy-mm-dd + self.since_date = since_date # 起始时间,即爬取发布日期从该值到结束时间的微博,形式为yyyy-mm-dd + self.end_date = config[ + 'end_date'] # 
结束时间,即爬取发布日期从起始时间到该值的微博,形式为yyyy-mm-dd,特殊值"now"代表现在 self.write_mode = config[ "write_mode"] # 结果信息保存类型,为list形式,可包含txt、csv、json、mongo和mysql五种类型 self.pic_download = config[ @@ -53,11 +55,14 @@ def __init__(self, config): self.user_config_file_path = user_id_list # 用户配置文件路径 user_config_list = config_util.get_user_config_list( user_id_list, self.since_date) + for user_config in user_config_list: + user_config['end_date'] = self.end_date else: self.user_config_file_path = "" user_config_list = [{ "user_uri": user_id, - "since_date": self.since_date + "since_date": self.since_date, + "end_date": self.end_date } for user_id in user_id_list] self.user_config_list = user_config_list # 要爬取的微博用户的user_config列表 self.user_config = {} # 用户配置,包含用户id和since_date @@ -98,9 +103,8 @@ def get_weibo_info(self): self.start_time = datetime.now().strftime("%Y-%m-%d %H:%M") for page in tqdm(range(1, page_num + 1), desc="Progress"): weibos, self.weibo_id_list = PageParser( - self.cookie, self.user_config["user_uri"], - page, self.filter).get_one_page( - self.user_config['since_date'], + self.cookie, + self.user_config, page, self.filter).get_one_page( self.weibo_id_list) # 获取第page页的全部微博 print(u"{}已获取{}({})的第{}页微博{}".format( "-" * 30, From 01eced145f6bed2e87d879500fb28e75dbc59de1 Mon Sep 17 00:00:00 2001 From: dataabc Date: Thu, 25 Jun 2020 21:51:15 +0800 Subject: [PATCH 232/363] =?UTF-8?q?fix:=20=E4=BF=AE=E5=A4=8Dyyyy-mm-dd=20H?= =?UTF-8?q?:M=E6=A0=BC=E5=BC=8F=E7=9A=84=E6=97=B6=E9=97=B4=E6=97=A0?= =?UTF-8?q?=E6=B3=95=E5=87=86=E7=A1=AE=E8=AF=86=E5=88=AB=E7=9A=84bug?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- weibo_spider/config_util.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/weibo_spider/config_util.py b/weibo_spider/config_util.py index 8eead2a4..6d6ec36d 100644 --- a/weibo_spider/config_util.py +++ b/weibo_spider/config_util.py @@ -4,10 +4,13 @@ from datetime import datetime -def _is_date(since_date): +def 
_is_date(date_str): """判断日期格式是否正确""" try: - datetime.strptime(since_date, "%Y-%m-%d") + if ':' in date_str: + datetime.strptime(date_str, '%Y-%m-%d %H:%M') + else: + datetime.strptime(date_str, '%Y-%m-%d') return True except ValueError: return False From 3ac424d4a58ec3c4b5fb0c357605ce43e8725b92 Mon Sep 17 00:00:00 2001 From: Zhengyang Song Date: Fri, 26 Jun 2020 13:40:54 +0800 Subject: [PATCH 233/363] refactor: change filename of weibo_spider.py to spider.py --- weibo_spider/__main__.py | 2 +- weibo_spider/{weibo_spider.py => spider.py} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename weibo_spider/{weibo_spider.py => spider.py} (100%) diff --git a/weibo_spider/__main__.py b/weibo_spider/__main__.py index e5808ef6..1961f82d 100644 --- a/weibo_spider/__main__.py +++ b/weibo_spider/__main__.py @@ -1,5 +1,5 @@ from absl import app -from .weibo_spider import main +from .spider import main app.run(main) diff --git a/weibo_spider/weibo_spider.py b/weibo_spider/spider.py similarity index 100% rename from weibo_spider/weibo_spider.py rename to weibo_spider/spider.py From b4e8b27f395967a7e56ae35b0bd8d747e9a71b68 Mon Sep 17 00:00:00 2001 From: dataabc Date: Fri, 26 Jun 2020 18:37:37 +0800 Subject: [PATCH 234/363] =?UTF-8?q?perf:=20=E4=BC=98=E5=8C=96=E6=97=A5?= =?UTF-8?q?=E6=9C=9F=E6=A0=BC=E5=BC=8F=E5=A4=84=E7=90=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- weibo_spider/config_util.py | 5 +++++ weibo_spider/parser/page_parser.py | 10 ++++++++-- weibo_spider/spider.py | 1 + 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/weibo_spider/config_util.py b/weibo_spider/config_util.py index 6d6ec36d..49981fee 100644 --- a/weibo_spider/config_util.py +++ b/weibo_spider/config_util.py @@ -30,6 +30,11 @@ def validate_config(config): if (not _is_date(since_date)) and (not since_date.isdigit()): sys.exit(u"since_date值应为yyyy-mm-dd形式或整数,请重新输入") + # 验证end_date + end_date = str(config["end_date"]) + if (not 
_is_date(end_date)) and (end_date != 'now'): + sys.exit(u'end_date值应为yyyy-mm-dd形式或"now",请重新输入') + # 验证write_mode write_mode = ["txt", "csv", "json", "mongo", "mysql"] if not isinstance(config["write_mode"], list): diff --git a/weibo_spider/parser/page_parser.py b/weibo_spider/parser/page_parser.py index e18d4ea9..034c5c1c 100644 --- a/weibo_spider/parser/page_parser.py +++ b/weibo_spider/parser/page_parser.py @@ -22,8 +22,14 @@ def __init__(self, cookie, user_config, page, filter): self.page = page self.url = "https://weibo.cn/%s?page=%d" % (self.user_uri, page) if self.end_date != 'now': - starttime = self.since_date.replace('-', '') - endtime = self.end_date.replace('-', '') + since_date = self.since_date.split(' ')[0].split('-') + end_date = self.end_date.split(' ')[0].split('-') + for date in [since_date, end_date]: + for i in range(1, 2): + if len(date[i]) == 1: + date[i] = '0' + date[i] + starttime = ''.join(since_date) + endtime = ''.join(end_date) self.url = 'https://weibo.cn/%s/profile?starttime=%s&endtime=%s&advancedfilter=1&page=%d' % ( self.user_uri, starttime, endtime, page) self.selector = handle_html(self.cookie, self.url) diff --git a/weibo_spider/spider.py b/weibo_spider/spider.py index e31262c1..df4ba4d1 100644 --- a/weibo_spider/spider.py +++ b/weibo_spider/spider.py @@ -252,6 +252,7 @@ def _get_config(): def main(_): try: config = _get_config() + config_util.validate_config(config) wb = Spider(config) wb.start() # 爬取微博信息 except Exception as e: From 4b1e783bed253ed065a4015871d785ac31f689a2 Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Fri, 26 Jun 2020 18:46:35 +0800 Subject: [PATCH 235/363] Update FAQ.md --- docs/FAQ.md | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/docs/FAQ.md b/docs/FAQ.md index 5ddd2ca7..74d56bcb 100644 --- a/docs/FAQ.md +++ b/docs/FAQ.md @@ -1,6 +1,12 @@ ## 常见问题 -### 1.程序运行出错,错误提示中包含“'NoneType' object”字样,如何解决? 
+### 1.程序运行出错,错误提示中包含“ImportError: cannot import name 'config_util' from '__main__'”,如何解决? +出现这种错误,说明使用者很可能是直接运行的.py文件,程序正确的运行方式是在weiboSpider目录下,运行如下命令: +``` +python3 -m weibo_spider +``` + +### 2.程序运行出错,错误提示中包含“'NoneType' object”字样,如何解决? 这是最常见的问题之一。出错原因是爬取速度太快,被暂时限制了。限制可能包含爬虫账号限制和ip限制。一般情况下,一段时间后限制会自动解除。可通过降低爬取速度避免被限制,具体修改weibo_spider.py文件中get_weibo_info方法的如下代码: ``` if (page - page1) % random_pages == 0 and page < page_num: @@ -29,19 +35,19 @@ ``` 上面的意思是每爬1到5个用户,随机等待6到10秒,你可以根据实际情况,修改代码中的数字。 -### 2.如何获取微博评论? +### 3.如何获取微博评论? 因为限制,只能获取一部分评论,无法获取全部,因此暂时没有添加获取评论功能的计划。 -### 3.有的长微博正文只能获取一部分内容,如何解决? +### 4.有的长微博正文只能获取一部分内容,如何解决? 程序是可以获取长微博全文的。程序首先在微博列表页获取微博,如果发现长微博(正文没有显示完整,以“全文”代替部分内容的微博),会先保存这个不全的内容,然后去该长微博的详情页尝试获取全文,如果获取成功,获取的内容就是微博文本;如果获取失败,等待若干秒重新获取;如果连续尝试5次都失败,就用上面不全的内容代替。这样做的原因是避免因部分长微博获取失败而卡住。如果想尝试更多次,可以修改comment_parser.py文件get_long_weibo方法内for循环的次数。 -### 4.如何按指定关键词获取微博? +### 5.如何按指定关键词获取微博? 请使用[weibo-search](https://github.com/dataabc/weibo-search)。该程序可以连续获取一个或多个微博关键词搜索结果,并将结果写入文件(可选)、数据库(可选)等。所谓微博关键词搜索即:搜索正文中包含指定关键词的微博,可以指定搜索的时间范围。对于非常热门的关键词,一天的时间范围,可以获得1000万以上的搜索结果,N天的时间范围就可以获得1000万 X N搜索结果。对于大多数关键词,一天产生的相应微博数量应该在1000万条以下,因此可以说该程序可以获得大部分关键词的全部或近似全部的搜索结果。而且该程序可以获得搜索结果的所有信息,本程序获得的微博信息该程序都能获得。 -### 5.如何获取微博用户关注列表中用户的user_id? +### 6.如何获取微博用户关注列表中用户的user_id? 请使用[weibo-follow](https://github.com/dataabc/weibo-follow)。该程序可以利用一个user_id,获取该user_id微博用户关注人的user_id,一个user_id最多可以获得200个user_id,并写入user_id_list.txt文件。程序支持读文件,利用这200个user_id,可以获得最多200X200=40000个user_id。再利用这40000个user_id可以得到40000X200=8000000个user_id,如此反复,以此类推,可以获得大量user_id。本项目也支持读文件,将上述程序的结果文件user_id_list.txt路径赋值给本项目config.json的user_id_list参数,就可以获得这些user_id用户所发布的大量微博。 -### 6.如何获取自己的微博? +### 7.如何获取自己的微博? 
修改info_parser.py和page_parser.py中__init__方法,将前者的self.url修改为: ``` self.url = "https://weibo.cn/%s/profile" % (user_id) From 000f4788796e9fa5a1a85ed696b13482c03951f7 Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Fri, 26 Jun 2020 18:54:50 +0800 Subject: [PATCH 236/363] Update example.md --- docs/example.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/example.md b/docs/example.md index e55ecac3..56c33fda 100644 --- a/docs/example.md +++ b/docs/example.md @@ -5,6 +5,7 @@ "user_id_list": ["1669879400"], "filter": 1, "since_date": "1900-01-01", + "end_date": "now", "write_mode": ["csv", "txt", "json"], "pic_download": 1, "video_download": 1, @@ -13,7 +14,7 @@ ``` 对于上述参数的含义以及取值范围,这里仅作简单介绍,详细信息见[程序设置](https://github.com/dataabc/weiboSpider/blob/master/docs/settings.md)。 ->**user_id_list**代表我们要爬取的微博用户的user_id,可以是一个或多个,也可以是文件路径,微博用户Dear-迪丽热巴的user_id为1669879400,具体如何获取user_id见[如何获取user_id](https://github.com/dataabc/weiboSpider/blob/master/docs/userid.md);
**filter**的值为1代表爬取全部原创微博,值为0代表爬取全部微博(原创+转发);
**since_date**代表我们要爬取since_date日期之后发布的微博,因为我要爬迪丽热巴的全部原创微博,所以since_date设置了一个非常早的值;
**write_mode**代表结果文件的保存类型,我想要把结果写入txt文件、csv文件和json文件,所以它的值为["csv", "txt", "json"],如果你想写入数据库,具体设置见[设置数据库](https://github.com/dataabc/weiboSpider/blob/master/docs/settings.md#设置数据库可选);
**pic_download**值为1代表下载微博中的图片,值为0代表不下载;
**video_download**值为1代表下载微博中的视频,值为0代表不下载;
**cookie**是爬虫微博的cookie,具体如何获取cookie见[cookie文档](https://github.com/dataabc/weiboSpider/blob/master/docs/cookie.md),获取cookie后把"your cookie"替换成真实的cookie值即可。
+>**user_id_list**代表我们要爬取的微博用户的user_id,可以是一个或多个,也可以是文件路径,微博用户Dear-迪丽热巴的user_id为1669879400,具体如何获取user_id见[如何获取user_id](https://github.com/dataabc/weiboSpider/blob/master/docs/userid.md);
**filter**的值为1代表爬取全部原创微博,值为0代表爬取全部微博(原创+转发);
**since_date**代表我们要爬取since_date日期之后发布的微博,因为我要爬迪丽热巴的全部原创微博,所以since_date设置了一个非常早的值;
**end_date**代表我们要爬取end_date日期之前发布的微博,since_date配合end_date,表示我们要爬取发布日期在since_date和end_date之间的微博,包含边界,如果end_date值为"now",表示爬取发布日期从since_date到现在的微博;
**write_mode**代表结果文件的保存类型,我想要把结果写入txt文件、csv文件和json文件,所以它的值为["csv", "txt", "json"],如果你想写入数据库,具体设置见[设置数据库](https://github.com/dataabc/weiboSpider/blob/master/docs/settings.md#设置数据库可选);
**pic_download**值为1代表下载微博中的图片,值为0代表不下载;
**video_download**值为1代表下载微博中的视频,值为0代表不下载;
**cookie**是爬虫微博的cookie,具体如何获取cookie见[cookie文档](https://github.com/dataabc/weiboSpider/blob/master/docs/cookie.md),获取cookie后把"your cookie"替换成真实的cookie值即可。
cookie修改完成后在weiboSpider目录下运行如下命令: ```bash From 80f4319d504df4a204df1db9b4e32cf1aecfd320 Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Fri, 26 Jun 2020 19:12:00 +0800 Subject: [PATCH 237/363] Update settings.md --- docs/settings.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/settings.md b/docs/settings.md index 77e6eec3..fa232762 100644 --- a/docs/settings.md +++ b/docs/settings.md @@ -9,6 +9,7 @@ $ python3 -m weibo_spider "user_id_list": ["1669879400"], "filter": 1, "since_date": "2018-01-01", + "end_date": "now", "write_mode": ["csv", "txt"], "pic_download": 1, "video_download": 1, @@ -57,6 +58,8 @@ since_date值可以是日期,也可以是整数。如果是日期,代表爬 ``` 代表爬取最近10天的微博,这个说法不是特别准确,准确说是爬取发布时间从**10天前到本程序开始执行时**之间的微博。
**since_date是所有user的爬取起始时间,非常不灵活。如果你要爬多个用户,并且想单独为每个用户设置一个since_date,可以使用[定期自动爬取微博](https://github.com/dataabc/weiboSpider/blob/master/docs/automation.md)方法二中的方法,该方法可以为多个用户设置不同的since_date,非常灵活。**
+**设置end_date**
+end_date值可以是日期,也可以是"now"。如果是日期,代表爬取该日期之前的微博,格式应为“yyyy-mm-dd”;如果是"now",代表爬取发布日期从since_date到现在的微博。since_date配合end_date,表示爬取发布日期在since_date和end_date之间的微博,包含边界。since_date是起始日期,end_date是结束日期,因此end_date时间应晚于since_date。注意,since_date既可以通过config.json文件的since_date参数设置,也可以通过user_id_list.txt设置;而end_date只能通过config.json文件的end_date参数设置,是全局变量,所有user_id都使用同一个end_date。
**设置write_mode**
write_mode控制结果文件格式,取值范围是csv、txt、json、mongo和mysql,分别代表将结果文件写入csv、txt、json、MongoDB和MySQL数据库。write_mode可以同时包含这些取值中的一个或几个,如: ``` From a4a65a210136455772f34552543451fc22f6c7c6 Mon Sep 17 00:00:00 2001 From: Mino <1227657064@qq.com> Date: Sat, 27 Jun 2020 21:40:34 +0800 Subject: [PATCH 238/363] =?UTF-8?q?=E6=96=B0=E5=A2=9E=E8=BF=90=E8=A1=8C?= =?UTF-8?q?=E6=96=B9=E5=BC=8F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- weibo_spider/__main__.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/weibo_spider/__main__.py b/weibo_spider/__main__.py index 1961f82d..8050b179 100644 --- a/weibo_spider/__main__.py +++ b/weibo_spider/__main__.py @@ -1,5 +1,8 @@ from absl import app -from .spider import main +import sys,os +sys.path.append(os.path.abspath(os.path.dirname(os.getcwd()))) + +from weibo_spider.spider import main app.run(main) From 08ce0c08f99f1dca4acd395b55a944433903fe74 Mon Sep 17 00:00:00 2001 From: dataabc Date: Sun, 28 Jun 2020 00:54:59 +0800 Subject: [PATCH 239/363] =?UTF-8?q?fix:=20=E4=BF=AE=E5=A4=8Duser=5Fid=5Fli?= =?UTF-8?q?st.txt=E8=B7=AF=E5=BE=84=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- weibo_spider/config_util.py | 3 +-- weibo_spider/spider.py | 4 +--- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/weibo_spider/config_util.py b/weibo_spider/config_util.py index 49981fee..04acfb09 100644 --- a/weibo_spider/config_util.py +++ b/weibo_spider/config_util.py @@ -52,8 +52,7 @@ def validate_config(config): sys.exit(u"user_id_list值应为list类型或txt文件路径") if not isinstance(user_id_list, list): if not os.path.isabs(user_id_list): - user_id_list = (os.path.split(os.path.realpath(__file__))[0] + - os.sep + user_id_list) + user_id_list = os.getcwd() + os.sep + user_id_list if not os.path.isfile(user_id_list): sys.exit(u"不存在%s文件" % user_id_list) diff --git a/weibo_spider/spider.py b/weibo_spider/spider.py index 
df4ba4d1..4ee3717d 100644 --- a/weibo_spider/spider.py +++ b/weibo_spider/spider.py @@ -48,10 +48,8 @@ def __init__(self, config): if not isinstance(user_id_list, list): if FLAGS.user_id_list is not None: user_id_list = FLAGS.user_id_list - else: + elif not os.path.isabs(user_id_list): user_id_list = os.getcwd() + os.sep + user_id_list - if not os.path.isfile(user_id_list): - sys.exit(u"当前路径:%s 不存在配置文件config.json" % user_id_list) self.user_config_file_path = user_id_list # 用户配置文件路径 user_config_list = config_util.get_user_config_list( user_id_list, self.since_date) From 2066ddbd6b576cc6e6107e38bfcd0aa614f8a396 Mon Sep 17 00:00:00 2001 From: dataabc Date: Sat, 4 Jul 2020 17:40:14 +0800 Subject: [PATCH 240/363] =?UTF-8?q?perf:=20=E4=BC=98=E5=8C=96=E6=97=B6?= =?UTF-8?q?=E9=97=B4=E6=A0=BC=E5=BC=8F=E5=A4=84=E7=90=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- weibo_spider/parser/page_parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/weibo_spider/parser/page_parser.py b/weibo_spider/parser/page_parser.py index 034c5c1c..b92081f3 100644 --- a/weibo_spider/parser/page_parser.py +++ b/weibo_spider/parser/page_parser.py @@ -25,7 +25,7 @@ def __init__(self, cookie, user_config, page, filter): since_date = self.since_date.split(' ')[0].split('-') end_date = self.end_date.split(' ')[0].split('-') for date in [since_date, end_date]: - for i in range(1, 2): + for i in [1, 2]: if len(date[i]) == 1: date[i] = '0' + date[i] starttime = ''.join(since_date) From c60d9d938ab48142b8b7979051886df76b13ff05 Mon Sep 17 00:00:00 2001 From: dataabc Date: Sat, 4 Jul 2020 18:12:33 +0800 Subject: [PATCH 241/363] =?UTF-8?q?fix:=20=E4=BF=AE=E5=A4=8D=E8=87=AA?= =?UTF-8?q?=E5=8A=A8=E6=9B=B4=E6=96=B0since=5Fdate=E6=97=B6=E6=96=B0?= =?UTF-8?q?=E5=80=BC=E4=B8=8D=E6=AD=A3=E7=A1=AE=E7=9A=84bug?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- weibo_spider/spider.py | 10 +++++++--- 1 
file changed, 7 insertions(+), 3 deletions(-) diff --git a/weibo_spider/spider.py b/weibo_spider/spider.py index 4ee3717d..603a7b56 100644 --- a/weibo_spider/spider.py +++ b/weibo_spider/spider.py @@ -64,7 +64,7 @@ def __init__(self, config): } for user_id in user_id_list] self.user_config_list = user_config_list # 要爬取的微博用户的user_config列表 self.user_config = {} # 用户配置,包含用户id和since_date - self.start_time = "" # 获取用户第一条微博时的时间 + self.new_since_date = '' # 完成某用户爬取后,自动生成对应用户新的since_date self.user = User() # 存储爬取到的用户信息 self.got_num = 0 # 存储爬取到的微博数 self.weibo_id_list = [] # 存储爬取到的所有微博id @@ -98,7 +98,11 @@ def get_weibo_info(self): self.user_config["user_uri"]).get_page_num() # 获取微博总页数 page1 = 0 random_pages = random.randint(1, 5) - self.start_time = datetime.now().strftime("%Y-%m-%d %H:%M") + if self.end_date == 'now': + self.new_since_date = datetime.now().strftime( + "%Y-%m-%d %H:%M") + else: + self.new_since_date = self.end_date for page in tqdm(range(1, page_num + 1), desc="Progress"): weibos, self.weibo_id_list = PageParser( self.cookie, @@ -219,7 +223,7 @@ def start(self): self.user_config_file_path, self.user_config["user_uri"], self.user.nickname, - self.start_time, + self.new_since_date, ) except Exception as e: print("Error: ", e) From 2a43116fbd1b9628bf7e1720477b76f6caca6523 Mon Sep 17 00:00:00 2001 From: dataabc Date: Sat, 4 Jul 2020 20:00:24 +0800 Subject: [PATCH 242/363] =?UTF-8?q?feat:=20=E6=B7=BB=E5=8A=A0=E4=BB=A5?= =?UTF-8?q?=E5=91=BD=E4=BB=A4=E8=A1=8C=E5=8F=82=E6=95=B0=E7=9A=84=E5=BD=A2?= =?UTF-8?q?=E5=BC=8F=E8=BE=93=E5=85=A5user=5Fid=E7=9A=84=E5=8A=9F=E8=83=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 命令行 python3 -m weibo_spider -u 1669879400,若想输入多个user_id,每个user_id要用英文逗号连接。该方式的所有user_id使用config.json中的since_date和end_date设置,通过修改它们的值可以控制爬取的时间范围。若user_id_list是文件路径,每个user_id都会自动保存到该文件内,且自动更新since_date;若不是路径,user_id会保存在当前目录的user_id_list.txt内,且自动更新since_date,若user_id_list.txt文件不存在,程序会自动创建它。 Issue #180 --- 
weibo_spider/config_util.py | 72 +++++++++++--------- weibo_spider/spider.py | 130 +++++++++++++++++++----------------- 2 files changed, 112 insertions(+), 90 deletions(-) diff --git a/weibo_spider/config_util.py b/weibo_spider/config_util.py index 04acfb09..71d04706 100644 --- a/weibo_spider/config_util.py +++ b/weibo_spider/config_util.py @@ -20,64 +20,64 @@ def validate_config(config): """验证配置是否正确""" # 验证filter、pic_download、video_download - argument_list = ["filter", "pic_download", "video_download"] + argument_list = ['filter', 'pic_download', 'video_download'] for argument in argument_list: if config[argument] != 0 and config[argument] != 1: - sys.exit(u"%s值应为0或1,请重新输入" % config[argument]) + sys.exit(u'%s值应为0或1,请重新输入' % config[argument]) # 验证since_date - since_date = str(config["since_date"]) + since_date = str(config['since_date']) if (not _is_date(since_date)) and (not since_date.isdigit()): - sys.exit(u"since_date值应为yyyy-mm-dd形式或整数,请重新输入") + sys.exit(u'since_date值应为yyyy-mm-dd形式或整数,请重新输入') # 验证end_date - end_date = str(config["end_date"]) + end_date = str(config['end_date']) if (not _is_date(end_date)) and (end_date != 'now'): sys.exit(u'end_date值应为yyyy-mm-dd形式或"now",请重新输入') # 验证write_mode - write_mode = ["txt", "csv", "json", "mongo", "mysql"] - if not isinstance(config["write_mode"], list): - sys.exit(u"write_mode值应为list类型") - for mode in config["write_mode"]: + write_mode = ['txt', 'csv', 'json', 'mongo', 'mysql'] + if not isinstance(config['write_mode'], list): + sys.exit(u'write_mode值应为list类型') + for mode in config['write_mode']: if mode not in write_mode: sys.exit( - u"%s为无效模式,请从txt、csv、json、mongo和mysql中挑选一个或多个作为write_mode" % + u'%s为无效模式,请从txt、csv、json、mongo和mysql中挑选一个或多个作为write_mode' % mode) # 验证user_id_list - user_id_list = config["user_id_list"] + user_id_list = config['user_id_list'] if (not isinstance(user_id_list, - list)) and (not user_id_list.endswith(".txt")): - sys.exit(u"user_id_list值应为list类型或txt文件路径") + list)) and (not 
user_id_list.endswith('.txt')): + sys.exit(u'user_id_list值应为list类型或txt文件路径') if not isinstance(user_id_list, list): if not os.path.isabs(user_id_list): user_id_list = os.getcwd() + os.sep + user_id_list if not os.path.isfile(user_id_list): - sys.exit(u"不存在%s文件" % user_id_list) + sys.exit(u'不存在%s文件' % user_id_list) def get_user_config_list(file_name, default_since_date): """获取文件中的微博id信息""" - with open(file_name, "rb") as f: + with open(file_name, 'rb') as f: try: lines = f.read().splitlines() - lines = [line.decode("utf-8-sig") for line in lines] + lines = [line.decode('utf-8-sig') for line in lines] except UnicodeDecodeError: - sys.exit(u"%s文件应为utf-8编码,请先将文件编码转为utf-8再运行程序" % file_name) + sys.exit(u'%s文件应为utf-8编码,请先将文件编码转为utf-8再运行程序' % file_name) user_config_list = [] for line in lines: - info = line.split(" ") + info = line.split(' ') if len(info) > 0 and info[0].isdigit(): user_config = {} - user_config["user_uri"] = info[0] + user_config['user_uri'] = info[0] if len(info) > 2 and _is_date(info[2]): - if len(info) > 3 and _is_date(info[2] + " " + info[3]): - user_config["since_date"] = info[2] + " " + info[3] + if len(info) > 3 and _is_date(info[2] + ' ' + info[3]): + user_config['since_date'] = info[2] + ' ' + info[3] else: - user_config["since_date"] = info[2] + user_config['since_date'] = info[2] else: - user_config["since_date"] = default_since_date + user_config['since_date'] = default_since_date if user_config not in user_config_list: user_config_list.append(user_config) return user_config_list @@ -86,11 +86,13 @@ def get_user_config_list(file_name, default_since_date): def update_user_config_file(user_config_file_path, user_uri, nickname, start_time): """更新用户配置文件""" - with open(user_config_file_path, "rb") as f: + if not user_config_file_path: + user_config_file_path = os.getcwd() + os.sep + 'user_id_list.txt' + with open(user_config_file_path, 'rb') as f: lines = f.read().splitlines() - lines = [line.decode("utf-8-sig") for line in lines] + lines = 
[line.decode('utf-8-sig') for line in lines] for i, line in enumerate(lines): - info = line.split(" ") + info = line.split(' ') if len(info) > 0: if user_uri == info[0]: if len(info) == 1: @@ -98,11 +100,21 @@ def update_user_config_file(user_config_file_path, user_uri, nickname, info.append(start_time) if len(info) == 2: info.append(start_time) - if len(info) > 3 and _is_date(info[2] + " " + info[3]): + if len(info) > 3 and _is_date(info[2] + ' ' + info[3]): del info[3] if len(info) > 2: info[2] = start_time - lines[i] = " ".join(info) + lines[i] = ' '.join(info) break - with codecs.open(user_config_file_path, "w", encoding="utf-8") as f: - f.write("\n".join(lines)) + with codecs.open(user_config_file_path, 'w', encoding='utf-8') as f: + f.write('\n'.join(lines)) + + +def add_user_uri_list(user_config_file_path, user_uri_list): + """向user_id_list.txt文件添加若干user_uri""" + if not user_config_file_path: + user_config_file_path = os.getcwd() + os.sep + 'user_id_list.txt' + if os.path.isfile(user_config_file_path): + user_uri_list[0] = '\n' + user_uri_list[0] + with codecs.open(user_config_file_path, 'a', encoding='utf-8') as f: + f.write('\n'.join(user_uri_list)) diff --git a/weibo_spider/spider.py b/weibo_spider/spider.py index 603a7b56..7023e09e 100644 --- a/weibo_spider/spider.py +++ b/weibo_spider/spider.py @@ -19,49 +19,59 @@ FLAGS = flags.FLAGS -flags.DEFINE_string("config_path", None, "The path to config.json.") -flags.DEFINE_string("user_id_list", None, "The path to user_id_list.txt.") -flags.DEFINE_string("output_dir", None, "The dir path to store results.") +flags.DEFINE_string('config_path', None, 'The path to config.json.') +flags.DEFINE_string('u', None, 'The user_id we want to input.') +flags.DEFINE_string('user_id_list', None, 'The path to user_id_list.txt.') +flags.DEFINE_string('output_dir', None, 'The dir path to store results.') class Spider: def __init__(self, config): """Weibo类初始化""" self.filter = config[ - "filter"] # 
取值范围为0、1,程序默认值为0,代表要爬取用户的全部微博,1代表只爬取用户的原创微博 - since_date = str(config["since_date"]) + 'filter'] # 取值范围为0、1,程序默认值为0,代表要爬取用户的全部微博,1代表只爬取用户的原创微博 + since_date = str(config['since_date']) if since_date.isdigit(): since_date = str(date.today() - timedelta(int(since_date))) self.since_date = since_date # 起始时间,即爬取发布日期从该值到结束时间的微博,形式为yyyy-mm-dd self.end_date = config[ 'end_date'] # 结束时间,即爬取发布日期从起始时间到该值的微博,形式为yyyy-mm-dd,特殊值"now"代表现在 self.write_mode = config[ - "write_mode"] # 结果信息保存类型,为list形式,可包含txt、csv、json、mongo和mysql五种类型 + 'write_mode'] # 结果信息保存类型,为list形式,可包含txt、csv、json、mongo和mysql五种类型 self.pic_download = config[ - "pic_download"] # 取值范围为0、1,程序默认值为0,代表不下载微博原始图片,1代表下载 + 'pic_download'] # 取值范围为0、1,程序默认值为0,代表不下载微博原始图片,1代表下载 self.video_download = config[ - "video_download"] # 取值范围为0、1,程序默认为0,代表不下载微博视频,1代表下载 - self.cookie = {"Cookie": config["cookie"]} - self.mysql_config = config.get("mysql_config") # MySQL数据库连接配置,可以不填 - - user_id_list = config["user_id_list"] + 'video_download'] # 取值范围为0、1,程序默认为0,代表不下载微博视频,1代表下载 + self.cookie = {'Cookie': config['cookie']} + self.mysql_config = config.get('mysql_config') # MySQL数据库连接配置,可以不填 + + self.user_config_file_path = '' + user_id_list = config['user_id_list'] + if FLAGS.user_id_list: + user_id_list = FLAGS.user_id_list if not isinstance(user_id_list, list): - if FLAGS.user_id_list is not None: - user_id_list = FLAGS.user_id_list - elif not os.path.isabs(user_id_list): + if not os.path.isabs(user_id_list): user_id_list = os.getcwd() + os.sep + user_id_list - self.user_config_file_path = user_id_list # 用户配置文件路径 + if not os.path.isfile(user_id_list): + sys.exit(u'不存在%s文件' % user_id_list) + self.user_config_file_path = user_id_list + if FLAGS.u: + user_id_list = FLAGS.u.split(',') + if isinstance(user_id_list, list): + user_id_list = list(set(user_id_list)) + user_config_list = [{ + 'user_uri': user_id, + 'since_date': self.since_date, + 'end_date': self.end_date + } for user_id in user_id_list] + if FLAGS.u: + 
config_util.add_user_uri_list(self.user_config_file_path, + user_id_list) + else: user_config_list = config_util.get_user_config_list( user_id_list, self.since_date) for user_config in user_config_list: user_config['end_date'] = self.end_date - else: - self.user_config_file_path = "" - user_config_list = [{ - "user_uri": user_id, - "since_date": self.since_date, - "end_date": self.end_date - } for user_id in user_id_list] self.user_config_list = user_config_list # 要爬取的微博用户的user_config列表 self.user_config = {} # 用户配置,包含用户id和since_date self.new_since_date = '' # 完成某用户爬取后,自动生成对应用户新的since_date @@ -89,31 +99,31 @@ def get_weibo_info(self): """获取微博信息""" try: since_date = datetime_util.str_to_time( - self.user_config["since_date"]) - now = datetime.now().strftime("%Y-%m-%d %H:%M") - now = datetime.strptime(now, "%Y-%m-%d %H:%M") + self.user_config['since_date']) + now = datetime.now().strftime('%Y-%m-%d %H:%M') + now = datetime.strptime(now, '%Y-%m-%d %H:%M') if since_date <= now: page_num = IndexParser( self.cookie, - self.user_config["user_uri"]).get_page_num() # 获取微博总页数 + self.user_config['user_uri']).get_page_num() # 获取微博总页数 page1 = 0 random_pages = random.randint(1, 5) if self.end_date == 'now': self.new_since_date = datetime.now().strftime( - "%Y-%m-%d %H:%M") + '%Y-%m-%d %H:%M') else: self.new_since_date = self.end_date - for page in tqdm(range(1, page_num + 1), desc="Progress"): + for page in tqdm(range(1, page_num + 1), desc='Progress'): weibos, self.weibo_id_list = PageParser( self.cookie, self.user_config, page, self.filter).get_one_page( self.weibo_id_list) # 获取第page页的全部微博 - print(u"{}已获取{}({})的第{}页微博{}".format( - "-" * 30, + print(u'{}已获取{}({})的第{}页微博{}'.format( + '-' * 30, self.user.nickname, self.user.id, page, - "-" * 30, + '-' * 30, )) if weibos: yield weibos @@ -128,7 +138,7 @@ def get_weibo_info(self): page1 = page random_pages = random.randint(1, 5) except Exception as e: - print("Error: ", e) + print('Error: ', e) traceback.print_exc() def 
_get_filepath(self, type): @@ -137,18 +147,18 @@ def _get_filepath(self, type): if FLAGS.output_dir is not None: file_dir = FLAGS.output_dir else: - file_dir = (os.getcwd() + os.sep + "weibo" + os.sep + + file_dir = (os.getcwd() + os.sep + 'weibo' + os.sep + self.user.nickname) - if type == "img" or type == "video": + if type == 'img' or type == 'video': file_dir = file_dir + os.sep + type if not os.path.isdir(file_dir): os.makedirs(file_dir) - if type == "img" or type == "video": + if type == 'img' or type == 'video': return file_dir - file_path = file_dir + os.sep + self.user.id + "." + type + file_path = file_dir + os.sep + self.user.id + '.' + type return file_path except Exception as e: - print("Error: ", e) + print('Error: ', e) traceback.print_exc() def initialize_info(self, user_config): @@ -158,25 +168,25 @@ def initialize_info(self, user_config): self.weibo_id_list = [] self.writers = [] - if "csv" in self.write_mode: + if 'csv' in self.write_mode: from .writer import CsvWriter self.writers.append( - CsvWriter(self._get_filepath("csv"), self.filter)) - if "txt" in self.write_mode: + CsvWriter(self._get_filepath('csv'), self.filter)) + if 'txt' in self.write_mode: from .writer import TxtWriter self.writers.append( - TxtWriter(self._get_filepath("txt"), self.filter)) - if "json" in self.write_mode: + TxtWriter(self._get_filepath('txt'), self.filter)) + if 'json' in self.write_mode: from .writer import JsonWriter - self.writers.append(JsonWriter(self._get_filepath("json"))) - if "mysql" in self.write_mode: + self.writers.append(JsonWriter(self._get_filepath('json'))) + if 'mysql' in self.write_mode: from .writer import MySqlWriter self.writers.append(MySqlWriter(self.mysql_config)) - if "mongo" in self.write_mode: + if 'mongo' in self.write_mode: from .writer import MongoWriter self.writers.append(MongoWriter()) @@ -185,12 +195,12 @@ def initialize_info(self, user_config): if self.pic_download == 1: from .downloader import ImgDownloader - 
self.downloaders.append(ImgDownloader(self._get_filepath("img"))) + self.downloaders.append(ImgDownloader(self._get_filepath('img'))) if self.video_download == 1: from .downloader import VideoDownloader self.downloaders.append( - VideoDownloader(self._get_filepath("video"))) + VideoDownloader(self._get_filepath('video'))) def start(self): """运行爬虫""" @@ -200,33 +210,33 @@ def start(self): u'没有配置有效的user_id,请通过config.json或user_id_list.txt配置user_id') return for user_config in self.user_config_list: - self.get_user_info(user_config["user_uri"]) + self.get_user_info(user_config['user_uri']) print(self.user) - print("*" * 100) + print('*' * 100) self.initialize_info(user_config) self.write_user(self.user) - print("*" * 100) + print('*' * 100) for weibos in self.get_weibo_info(): self.write_weibo(weibos) self.got_num += len(weibos) if not self.filter: - print(u"共爬取" + str(self.got_num) + u"条微博") + print(u'共爬取' + str(self.got_num) + u'条微博') else: - print(u"共爬取" + str(self.got_num) + u"条原创微博") - print(u"信息抓取完毕") - print("*" * 100) + print(u'共爬取' + str(self.got_num) + u'条原创微博') + print(u'信息抓取完毕') + print('*' * 100) - if self.user_config_file_path: + if self.user_config_file_path or FLAGS.u: config_util.update_user_config_file( self.user_config_file_path, - self.user_config["user_uri"], + self.user_config['user_uri'], self.user.nickname, self.new_since_date, ) except Exception as e: - print("Error: ", e) + print('Error: ', e) traceback.print_exc() @@ -258,9 +268,9 @@ def main(_): wb = Spider(config) wb.start() # 爬取微博信息 except Exception as e: - print("Error: ", e) + print('Error: ', e) traceback.print_exc() -if __name__ == "__main__": +if __name__ == '__main__': app.run(main) From 182565c33f189bf5adc7a43971f2313deaea18fb Mon Sep 17 00:00:00 2001 From: dataabc Date: Sun, 5 Jul 2020 17:56:58 +0800 Subject: [PATCH 243/363] =?UTF-8?q?fix:=20=E4=BF=AE=E5=A4=8D=E5=BD=93end?= =?UTF-8?q?=5Fdate=E5=80=BC=E9=9D=9Enow=E6=97=B6=E6=97=A0=E6=B3=95?= 
=?UTF-8?q?=E8=8E=B7=E5=8F=96=E5=9B=BE=E7=89=87=E7=9A=84=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- weibo_spider/parser/page_parser.py | 218 ++++++++++++++--------------- weibo_spider/parser/util.py | 2 +- 2 files changed, 110 insertions(+), 110 deletions(-) diff --git a/weibo_spider/parser/page_parser.py b/weibo_spider/parser/page_parser.py index b92081f3..aa4e6cc4 100644 --- a/weibo_spider/parser/page_parser.py +++ b/weibo_spider/parser/page_parser.py @@ -20,7 +20,7 @@ def __init__(self, cookie, user_config, page, filter): self.since_date = user_config['since_date'] self.end_date = user_config['end_date'] self.page = page - self.url = "https://weibo.cn/%s?page=%d" % (self.user_uri, page) + self.url = 'https://weibo.cn/%s?page=%d' % (self.user_uri, page) if self.end_date != 'now': since_date = self.since_date.split(' ')[0].split('-') end_date = self.end_date.split(' ')[0].split('-') @@ -57,12 +57,12 @@ def get_one_page(self, weibo_id_list): else: return weibos, weibo_id_list print(weibo) - print("-" * 100) + print('-' * 100) weibos.append(weibo) weibo_id_list.append(weibo.id) return weibos, weibo_id_list except Exception as e: - print("Error: ", e) + print('Error: ', e) traceback.print_exc() def is_original(self, info): @@ -77,93 +77,93 @@ def get_original_weibo(self, info, weibo_id): """获取原创微博""" try: weibo_content = handle_garbled(info) - weibo_content = weibo_content[:weibo_content.rfind(u"赞")] - a_text = info.xpath("div//a/text()") - if u"全文" in a_text: + weibo_content = weibo_content[:weibo_content.rfind(u'赞')] + a_text = info.xpath('div//a/text()') + if u'全文' in a_text: wb_content = CommentParser(self.cookie, weibo_id).get_long_weibo() if wb_content: weibo_content = wb_content return weibo_content except Exception as e: - print("Error: ", e) + print('Error: ', e) traceback.print_exc() def get_retweet(self, info, weibo_id): """获取转发微博""" try: weibo_content = handle_garbled(info) - 
weibo_content = weibo_content[weibo_content.find(":") + - 1:weibo_content.rfind(u"赞")] - weibo_content = weibo_content[:weibo_content.rfind(u"赞")] - a_text = info.xpath("div//a/text()") - if u"全文" in a_text: + weibo_content = weibo_content[weibo_content.find(':') + + 1:weibo_content.rfind(u'赞')] + weibo_content = weibo_content[:weibo_content.rfind(u'赞')] + a_text = info.xpath('div//a/text()') + if u'全文' in a_text: wb_content = CommentParser(self.cookie, weibo_id).get_long_retweet() if wb_content: weibo_content = wb_content - retweet_reason = handle_garbled(info.xpath("div")[-1]) - retweet_reason = retweet_reason[:retweet_reason.rindex(u"赞")] + retweet_reason = handle_garbled(info.xpath('div')[-1]) + retweet_reason = retweet_reason[:retweet_reason.rindex(u'赞')] original_user = info.xpath("div/span[@class='cmt']/a/text()") if original_user: original_user = original_user[0] - weibo_content = (retweet_reason + "\n" + u"原始用户: " + - original_user + "\n" + u"转发内容: " + + weibo_content = (retweet_reason + '\n' + u'原始用户: ' + + original_user + '\n' + u'转发内容: ' + weibo_content) else: - weibo_content = (retweet_reason + "\n" + u"转发内容: " + + weibo_content = (retweet_reason + '\n' + u'转发内容: ' + weibo_content) return weibo_content except Exception as e: - print("Error: ", e) + print('Error: ', e) traceback.print_exc() def get_weibo_content(self, info, is_original): """获取微博内容""" try: - weibo_id = info.xpath("@id")[0][2:] + weibo_id = info.xpath('@id')[0][2:] if is_original: weibo_content = self.get_original_weibo(info, weibo_id) else: weibo_content = self.get_retweet(info, weibo_id) return weibo_content except Exception as e: - print("Error: ", e) + print('Error: ', e) traceback.print_exc() def get_article_url(self, info): """获取微博头条文章的url""" - article_url = "" + article_url = '' text = handle_garbled(info) - if text.startswith(u"发布了头条文章"): - url = info.xpath(".//a/@href") - if url and url[0].startswith("https://weibo.cn/sinaurl"): + if text.startswith(u'发布了头条文章'): + url = 
info.xpath('.//a/@href') + if url and url[0].startswith('https://weibo.cn/sinaurl'): article_url = url[0] return article_url def get_publish_place(self, info): """获取微博发布位置""" try: - div_first = info.xpath("div")[0] - a_list = div_first.xpath("a") - publish_place = u"无" + div_first = info.xpath('div')[0] + a_list = div_first.xpath('a') + publish_place = u'无' for a in a_list: - if ("place.weibo.com" in a.xpath("@href")[0] - and a.xpath("text()")[0] == u"显示地图"): + if ('place.weibo.com' in a.xpath('@href')[0] + and a.xpath('text()')[0] == u'显示地图'): weibo_a = div_first.xpath("span[@class='ctt']/a") if len(weibo_a) >= 1: publish_place = weibo_a[-1] - if (u"视频" == div_first.xpath( + if (u'视频' == div_first.xpath( "span[@class='ctt']/a/text()")[-1][-2:]): if len(weibo_a) >= 2: publish_place = weibo_a[-2] else: - publish_place = u"无" + publish_place = u'无' publish_place = handle_garbled(publish_place) break return publish_place except Exception as e: - print("Error: ", e) + print('Error: ', e) traceback.print_exc() def get_publish_time(self, info): @@ -171,31 +171,31 @@ def get_publish_time(self, info): try: str_time = info.xpath("div/span[@class='ct']") str_time = handle_garbled(str_time[0]) - publish_time = str_time.split(u"来自")[0] - if u"刚刚" in publish_time: - publish_time = datetime.now().strftime("%Y-%m-%d %H:%M") - elif u"分钟" in publish_time: - minute = publish_time[:publish_time.find(u"分钟")] + publish_time = str_time.split(u'来自')[0] + if u'刚刚' in publish_time: + publish_time = datetime.now().strftime('%Y-%m-%d %H:%M') + elif u'分钟' in publish_time: + minute = publish_time[:publish_time.find(u'分钟')] minute = timedelta(minutes=int(minute)) publish_time = (datetime.now() - - minute).strftime("%Y-%m-%d %H:%M") - elif u"今天" in publish_time: - today = datetime.now().strftime("%Y-%m-%d") + minute).strftime('%Y-%m-%d %H:%M') + elif u'今天' in publish_time: + today = datetime.now().strftime('%Y-%m-%d') time = publish_time[3:] - publish_time = today + " " + time + publish_time = 
today + ' ' + time if len(publish_time) > 16: publish_time = publish_time[:16] - elif u"月" in publish_time: - year = datetime.now().strftime("%Y") + elif u'月' in publish_time: + year = datetime.now().strftime('%Y') month = publish_time[0:2] day = publish_time[3:5] time = publish_time[7:12] - publish_time = year + "-" + month + "-" + day + " " + time + publish_time = year + '-' + month + '-' + day + ' ' + time else: publish_time = publish_time[:16] return publish_time except Exception as e: - print("Error: ", e) + print('Error: ', e) traceback.print_exc() def get_publish_tool(self, info): @@ -203,100 +203,100 @@ def get_publish_tool(self, info): try: str_time = info.xpath("div/span[@class='ct']") str_time = handle_garbled(str_time[0]) - if len(str_time.split(u"来自")) > 1: - publish_tool = str_time.split(u"来自")[1] + if len(str_time.split(u'来自')) > 1: + publish_tool = str_time.split(u'来自')[1] else: - publish_tool = u"无" + publish_tool = u'无' return publish_tool except Exception as e: - print("Error: ", e) + print('Error: ', e) traceback.print_exc() def get_weibo_footer(self, info): """获取微博点赞数、转发数、评论数""" try: footer = {} - pattern = r"\d+" - str_footer = info.xpath("div")[-1] + pattern = r'\d+' + str_footer = info.xpath('div')[-1] str_footer = handle_garbled(str_footer) - str_footer = str_footer[str_footer.rfind(u"赞"):] + str_footer = str_footer[str_footer.rfind(u'赞'):] weibo_footer = re.findall(pattern, str_footer, re.M) up_num = int(weibo_footer[0]) - footer["up_num"] = up_num + footer['up_num'] = up_num retweet_num = int(weibo_footer[1]) - footer["retweet_num"] = retweet_num + footer['retweet_num'] = retweet_num comment_num = int(weibo_footer[2]) - footer["comment_num"] = comment_num + footer['comment_num'] = comment_num return footer except Exception as e: - print("Error: ", e) + print('Error: ', e) traceback.print_exc() def get_picture_urls(self, info, is_original): """获取微博原始图片url""" try: - weibo_id = info.xpath("@id")[0][2:] + weibo_id = info.xpath('@id')[0][2:] 
picture_urls = {} if is_original: original_pictures = self.extract_picture_urls(info, weibo_id) - picture_urls["original_pictures"] = original_pictures + picture_urls['original_pictures'] = original_pictures if not self.filter: - picture_urls["retweet_pictures"] = u"无" + picture_urls['retweet_pictures'] = u'无' else: retweet_url = info.xpath("div/a[@class='cc']/@href")[0] - retweet_id = retweet_url.split("/")[-1].split("?")[0] + retweet_id = retweet_url.split('/')[-1].split('?')[0] retweet_pictures = self.extract_picture_urls(info, retweet_id) - picture_urls["retweet_pictures"] = retweet_pictures - a_list = info.xpath("div[last()]/a/@href") - original_picture = u"无" + picture_urls['retweet_pictures'] = retweet_pictures + a_list = info.xpath('div[last()]/a/@href') + original_picture = u'无' for a in a_list: - if a.endswith((".gif", ".jpeg", ".jpg", ".png")): + if a.endswith(('.gif', '.jpeg', '.jpg', '.png')): original_picture = a break - picture_urls["original_pictures"] = original_picture + picture_urls['original_pictures'] = original_picture return picture_urls except Exception as e: - print("Error: ", e) + print('Error: ', e) traceback.print_exc() def get_video_url(self, info, is_original): """获取微博视频url""" try: if is_original: - div_first = info.xpath("div")[0] - a_list = div_first.xpath(".//a") - video_link = u"无" + div_first = info.xpath('div')[0] + a_list = div_first.xpath('.//a') + video_link = u'无' for a in a_list: - if "m.weibo.cn/s/video/show?object_id=" in a.xpath( - "@href")[0]: - video_link = a.xpath("@href")[0] + if 'm.weibo.cn/s/video/show?object_id=' in a.xpath( + '@href')[0]: + video_link = a.xpath('@href')[0] break - if video_link != u"无": + if video_link != u'无': video_link = video_link.replace( - "m.weibo.cn/s/video/show", "m.weibo.cn/s/video/object") + 'm.weibo.cn/s/video/show', 'm.weibo.cn/s/video/object') wb_info = requests.get(video_link, cookies=self.cookie).json() - video_url = wb_info["data"]["object"]["stream"].get( - "hd_url") + video_url 
= wb_info['data']['object']['stream'].get( + 'hd_url') if not video_url: - video_url = wb_info["data"]["object"]["stream"]["url"] + video_url = wb_info['data']['object']['stream']['url'] if not video_url: # 说明该视频为直播 - video_url = u"无" + video_url = u'无' else: - video_url = u"无" + video_url = u'无' return video_url except Exception as e: - return u"无" - print("Error: ", e) + return u'无' + print('Error: ', e) traceback.print_exc() def is_pinned_weibo(self, info): """判断微博是否为置顶微博""" kt = info.xpath(".//span[@class='kt']/text()") - if kt and kt[0] == u"置顶": + if kt and kt[0] == u'置顶': return True else: return False @@ -307,16 +307,16 @@ def get_one_weibo(self, info): weibo = Weibo() is_original = self.is_original(info) if (not self.filter) or is_original: - weibo.id = info.xpath("@id")[0][2:] + weibo.id = info.xpath('@id')[0][2:] weibo.content = self.get_weibo_content(info, is_original) # 微博内容 weibo.article_url = self.get_article_url(info) # 头条文章url picture_urls = self.get_picture_urls(info, is_original) weibo.original_pictures = picture_urls[ - "original_pictures"] # 原创图片url + 'original_pictures'] # 原创图片url if not self.filter: weibo.retweet_pictures = picture_urls[ - "retweet_pictures"] # 转发图片url + 'retweet_pictures'] # 转发图片url weibo.original = is_original # 是否原创微博 weibo.video_url = self.get_video_url(info, is_original) # 微博视频url @@ -324,51 +324,51 @@ def get_one_weibo(self, info): weibo.publish_time = self.get_publish_time(info) # 微博发布时间 weibo.publish_tool = self.get_publish_tool(info) # 微博发布工具 footer = self.get_weibo_footer(info) - weibo.up_num = footer["up_num"] # 微博点赞数 - weibo.retweet_num = footer["retweet_num"] # 转发数 - weibo.comment_num = footer["comment_num"] # 评论数 + weibo.up_num = footer['up_num'] # 微博点赞数 + weibo.retweet_num = footer['retweet_num'] # 转发数 + weibo.comment_num = footer['comment_num'] # 评论数 else: weibo = None - print(u"正在过滤转发微博") + print(u'正在过滤转发微博') return weibo except Exception as e: - print("Error: ", e) + print('Error: ', e) traceback.print_exc() 
def extract_picture_urls(self, info, weibo_id): """提取微博原始图片url""" try: - a_list = info.xpath("div/a/@href") - first_pic = "https://weibo.cn/mblog/pic/" + weibo_id + "?rl=0" - all_pic = "https://weibo.cn/mblog/picAll/" + weibo_id + "?rl=1" - picture_urls = u"无" - if first_pic in a_list: - if all_pic in a_list: + a_list = info.xpath('div/a/@href') + first_pic = 'https://weibo.cn/mblog/pic/' + weibo_id + all_pic = 'https://weibo.cn/mblog/picAll/' + weibo_id + picture_urls = u'无' + if first_pic in ''.join(a_list): + if all_pic in ''.join(a_list): preview_picture_list = MblogPicAllParser( self.cookie, weibo_id).extract_preview_picture_list() picture_list = [ - p.replace("/thumb180/", "/large/") + p.replace('/thumb180/', '/large/') for p in preview_picture_list ] - picture_urls = ",".join(picture_list) + picture_urls = ','.join(picture_list) else: - if info.xpath(".//img/@src"): - for link in info.xpath("div/a"): - if len(link.xpath("@href")) > 0: - if first_pic == link.xpath("@href")[0]: - if len(link.xpath("img/@src")) > 0: + if info.xpath('.//img/@src'): + for link in info.xpath('div/a'): + if len(link.xpath('@href')) > 0: + if first_pic in link.xpath('@href')[0]: + if len(link.xpath('img/@src')) > 0: preview_picture = link.xpath( - "img/@src")[0] + 'img/@src')[0] picture_urls = preview_picture.replace( - "/wap180/", "/large/") + '/wap180/', '/large/') break else: sys.exit( - u"爬虫微博可能被设置成了'不显示图片',请前往" - u"'https://weibo.cn/account/customize/pic',修改为'显示'" + u'爬虫微博可能被设置成了"不显示图片",请前往' + u'"https://weibo.cn/account/customize/pic",修改为"显示"' ) return picture_urls except Exception as e: - return u"无" - print("Error: ", e) + return u'无' + print('Error: ', e) traceback.print_exc() diff --git a/weibo_spider/parser/util.py b/weibo_spider/parser/util.py index 3724c83e..e311b02e 100644 --- a/weibo_spider/parser/util.py +++ b/weibo_spider/parser/util.py @@ -1,6 +1,6 @@ +import hashlib import sys import traceback -import hashlib import requests from lxml import etree From 
2b34e32a30b3fc6744954b9c3e774c721c7e2cc6 Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Sun, 5 Jul 2020 18:26:19 +0800 Subject: [PATCH 244/363] Update README.md --- README.md | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 90eac211..49a6f54e 100644 --- a/README.md +++ b/README.md @@ -136,12 +136,24 @@ $ python3 -m pip install weibo-spider $ python3 -m weibo_spider ``` -第一次执行,会自动在当前目录创建config.json配置文件,配置好后执行同样的命令就可以获取微博了。如果你已经有config.json文件了,也可以通过config_path参数配置config.json路径,运行程序,命令行如下: +第一次执行,会自动在当前目录创建config.json配置文件,配置好后执行同样的命令就可以获取微博了。 + +如果你已经有config.json文件了,也可以通过config_path参数配置config.json路径,运行程序,命令行如下: ```bash $ python3 -m weibo_spider --config_path="config.json" ``` +如果你想指定文件(csv、txt、json、图片、视频)保存路径,可以通过output_dir参数设定。假如你想把文件保存到/home/weibo/目录,可以运行如下命令: +``` +$ python3 -m weibo_spider --output_dir="/home/weibo/" +``` +如果你想通过命令行输入user_id,可以使用参数u,可以输入一个或多个user_id,每个user_id以英文逗号分开,如果这些user_id中有重复的user_id,程序会自动去重。命令行如下: +``` +$ python3 -m weibo_spider --u="1669879400,1223178222" +``` +程序会获取user_id分别为1669879400和1223178222的微博用户的微博,后面会讲[如何获取user_id](#如何获取user_id)。该方式的所有user_id使用config.json中的since_date和end_date设置,通过修改它们的值可以控制爬取的时间范围。若config.json中的user_id_list是文件路径,每个命令行中的user_id都会自动保存到该文件内,且自动更新since_date;若不是路径,user_id会保存在当前目录的user_id_list.txt内,且自动更新since_date,若当前目录下不存在user_id_list.txt,程序会自动创建它。 + ## 个性化定制程序(可选) 本部分为可选部分,如果不需要个性化定制程序或添加新功能,可以忽略此部分。 From 347d0a53a1fd96a7f738c64d26594126283eedb9 Mon Sep 17 00:00:00 2001 From: dataabc Date: Sun, 5 Jul 2020 18:34:25 +0800 Subject: [PATCH 245/363] =?UTF-8?q?per:=20=E4=BC=98=E5=8C=96=E6=96=87?= =?UTF-8?q?=E4=BB=B6=E5=86=99=E5=85=A5=E8=B7=AF=E5=BE=84?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Issue #186 --- weibo_spider/spider.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/weibo_spider/spider.py b/weibo_spider/spider.py index 7023e09e..3eed2607 100644 --- a/weibo_spider/spider.py +++ 
b/weibo_spider/spider.py @@ -145,7 +145,7 @@ def _get_filepath(self, type): """获取结果文件路径""" try: if FLAGS.output_dir is not None: - file_dir = FLAGS.output_dir + file_dir = FLAGS.output_dir + os.sep + self.user.nickname else: file_dir = (os.getcwd() + os.sep + 'weibo' + os.sep + self.user.nickname) From 54e9fd2478343bcbcd9bfe8381304be444031ed6 Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Sun, 5 Jul 2020 18:43:32 +0800 Subject: [PATCH 246/363] Update settings.md --- docs/settings.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/settings.md b/docs/settings.md index fa232762..0d0fa389 100644 --- a/docs/settings.md +++ b/docs/settings.md @@ -59,7 +59,7 @@ since_date值可以是日期,也可以是整数。如果是日期,代表爬 代表爬取最近10天的微博,这个说法不是特别准确,准确说是爬取发布时间从**10天前到本程序开始执行时**之间的微博。
**since_date是所有user的爬取起始时间,非常不灵活。如果你要爬多个用户,并且想单独为每个用户设置一个since_date,可以使用[定期自动爬取微博](https://github.com/dataabc/weiboSpider/blob/master/docs/automation.md)方法二中的方法,该方法可以为多个用户设置不同的since_date,非常灵活。**
**设置end_date**
-end_date值可以是日期,也可以是"now"。如果是日期,代表爬取该日期之前的微博,格式应为“yyyy-mm-dd”;如果是"now",代表爬取发布日期从since_date到现在的微博。since_date配合end_date,表示爬取发布日期在since_date和end_date之间的微博,包含边界。since_date是起始日期,end_date是结束日期,因此end_date时间应晚于since_date。注意,since_date即可以通过config.json文件的since_date参数设置,也可以通过user_id_list.txt设置;而end_date只能通过config.json文件的end_date参数设置,是全局变量,所有user_id都使用同一个end_date。
+end_date值可以是日期,也可以是"now"。如果是日期,代表爬取该日期之前的微博,格式应为“yyyy-mm-dd”;如果是"now",代表爬取发布日期从since_date到现在的微博。since_date配合end_date,表示爬取发布日期在since_date和end_date之间的微博,包含边界。since_date是起始日期,end_date是结束日期,因此end_date时间应晚于since_date。注意,since_date即可以通过config.json文件的since_date参数设置,也可以通过user_id_list.txt设置;而end_date只能通过config.json文件的end_date参数设置,是全局变量,所有user_id都使用同一个end_date。当end_date值不是"now"时,程序无法获取微博中的视频,如果想要获取视频,请为end_date赋值为"now"。
**设置write_mode**
write_mode控制结果文件格式,取值范围是csv、txt、json、mongo和mysql,分别代表将结果文件写入csv、txt、json、MongoDB和MySQL数据库。write_mode可以同时包含这些取值中的一个或几个,如: ``` From 141d23d0dacd92c72f2668797eac94824fab7501 Mon Sep 17 00:00:00 2001 From: dataabc Date: Sat, 11 Jul 2020 18:17:33 +0800 Subject: [PATCH 247/363] =?UTF-8?q?feat:=20=E6=B7=BB=E5=8A=A0=E6=97=A5?= =?UTF-8?q?=E5=BF=97=E5=8A=9F=E8=83=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 可以通过logging.conf修改日志配置,本程序日志包括三个功能:1.在命令行输出DEBUG级别及以上的信息;2.在all.log文件写入INFO级别及以上的信息;3.在error.log写入WARNING级别及以上的信息。 Issue #161 --- weibo_spider/__main__.py | 6 +-- weibo_spider/config_util.py | 29 +++++++---- weibo_spider/downloader/downloader.py | 39 +++++++------- weibo_spider/downloader/img_downloader.py | 22 ++++---- weibo_spider/downloader/video_downloader.py | 10 ++-- weibo_spider/logging.conf | 45 +++++++++++++++++ weibo_spider/parser/comment_parser.py | 22 ++++---- weibo_spider/parser/index_parser.py | 22 ++++---- weibo_spider/parser/info_parser.py | 41 +++++++-------- weibo_spider/parser/mblog_picAll_parser.py | 6 +-- weibo_spider/parser/page_parser.py | 52 ++++++++----------- weibo_spider/parser/parser.py | 2 +- weibo_spider/parser/util.py | 19 ++++--- weibo_spider/spider.py | 56 ++++++++++++--------- weibo_spider/user.py | 12 ++--- weibo_spider/weibo.py | 14 +++--- weibo_spider/writer/csv_writer.py | 20 ++++---- weibo_spider/writer/json_writer.py | 29 ++++++----- weibo_spider/writer/mongo_writer.py | 25 +++++---- weibo_spider/writer/mysql_writer.py | 33 ++++++------ weibo_spider/writer/txt_writer.py | 52 ++++++++++--------- 21 files changed, 310 insertions(+), 246 deletions(-) create mode 100644 weibo_spider/logging.conf diff --git a/weibo_spider/__main__.py b/weibo_spider/__main__.py index 8050b179..f1eafa65 100644 --- a/weibo_spider/__main__.py +++ b/weibo_spider/__main__.py @@ -1,8 +1,8 @@ -from absl import app +import os +import sys -import sys,os +from absl import app 
sys.path.append(os.path.abspath(os.path.dirname(os.getcwd()))) - from weibo_spider.spider import main app.run(main) diff --git a/weibo_spider/config_util.py b/weibo_spider/config_util.py index 71d04706..8dfd4421 100644 --- a/weibo_spider/config_util.py +++ b/weibo_spider/config_util.py @@ -1,8 +1,11 @@ import codecs +import logging import os import sys from datetime import datetime +logger = logging.getLogger('spider.config_util') + def _is_date(date_str): """判断日期格式是否正确""" @@ -23,38 +26,45 @@ def validate_config(config): argument_list = ['filter', 'pic_download', 'video_download'] for argument in argument_list: if config[argument] != 0 and config[argument] != 1: - sys.exit(u'%s值应为0或1,请重新输入' % config[argument]) + logger.warning(u'%s值应为0或1,请重新输入', config[argument]) + sys.exit() # 验证since_date since_date = str(config['since_date']) if (not _is_date(since_date)) and (not since_date.isdigit()): - sys.exit(u'since_date值应为yyyy-mm-dd形式或整数,请重新输入') + logger.warning(u'since_date值应为yyyy-mm-dd形式或整数,请重新输入') + sys.exit() # 验证end_date end_date = str(config['end_date']) if (not _is_date(end_date)) and (end_date != 'now'): - sys.exit(u'end_date值应为yyyy-mm-dd形式或"now",请重新输入') + logger.warning(u'end_date值应为yyyy-mm-dd形式或"now",请重新输入') + sys.exit() # 验证write_mode write_mode = ['txt', 'csv', 'json', 'mongo', 'mysql'] if not isinstance(config['write_mode'], list): - sys.exit(u'write_mode值应为list类型') + logger.warning(u'write_mode值应为list类型') + sys.exit() for mode in config['write_mode']: if mode not in write_mode: - sys.exit( - u'%s为无效模式,请从txt、csv、json、mongo和mysql中挑选一个或多个作为write_mode' % + logger.warning( + u'%s为无效模式,请从txt、csv、json、mongo和mysql中挑选一个或多个作为write_mode', mode) + sys.exit() # 验证user_id_list user_id_list = config['user_id_list'] if (not isinstance(user_id_list, list)) and (not user_id_list.endswith('.txt')): - sys.exit(u'user_id_list值应为list类型或txt文件路径') + logger.warning(u'user_id_list值应为list类型或txt文件路径') + sys.exit() if not isinstance(user_id_list, list): if not 
os.path.isabs(user_id_list): user_id_list = os.getcwd() + os.sep + user_id_list if not os.path.isfile(user_id_list): - sys.exit(u'不存在%s文件' % user_id_list) + logger.warning(u'不存在%s文件', user_id_list) + sys.exit() def get_user_config_list(file_name, default_since_date): @@ -64,7 +74,8 @@ def get_user_config_list(file_name, default_since_date): lines = f.read().splitlines() lines = [line.decode('utf-8-sig') for line in lines] except UnicodeDecodeError: - sys.exit(u'%s文件应为utf-8编码,请先将文件编码转为utf-8再运行程序' % file_name) + logger.error(u'%s文件应为utf-8编码,请先将文件编码转为utf-8再运行程序', file_name) + sys.exit() user_config_list = [] for line in lines: info = line.split(' ') diff --git a/weibo_spider/downloader/downloader.py b/weibo_spider/downloader/downloader.py index 568160d2..b198b45b 100644 --- a/weibo_spider/downloader/downloader.py +++ b/weibo_spider/downloader/downloader.py @@ -1,21 +1,23 @@ # -*- coding: UTF-8 -*- +import logging import os import sys -import traceback - from abc import ABC, abstractmethod + import requests from requests.adapters import HTTPAdapter from tqdm import tqdm +logger = logging.getLogger('spider.downloader') + class Downloader(ABC): def __init__(self, file_dir): self.file_dir = file_dir - self.file_type = "" - self.describe = u"" - self.key = "" + self.file_type = '' + self.describe = u'' + self.key = '' @abstractmethod def handle_download(self, urls, w): @@ -30,8 +32,7 @@ def get_filepath(self): os.makedirs(file_dir) return file_dir except Exception as e: - print("Error: ", e) - traceback.print_exc() + logger.exception(e) def download_one_file(self, url, file_path, weibo_id): """下载单个文件(图片/视频)""" @@ -40,25 +41,23 @@ def download_one_file(self, url, file_path, weibo_id): s = requests.Session() s.mount(url, HTTPAdapter(max_retries=5)) downloaded = s.get(url, timeout=(5, 10)) - with open(file_path, "wb") as f: + with open(file_path, 'wb') as f: f.write(downloaded.content) except Exception as e: - error_file = self.get_filepath() + os.sep + "not_downloaded.txt" - 
with open(error_file, "ab") as f: - url = weibo_id + ":" + url + "\n" + error_file = self.get_filepath() + os.sep + 'not_downloaded.txt' + with open(error_file, 'ab') as f: + url = weibo_id + ':' + url + '\n' f.write(url.encode(sys.stdout.encoding)) - print("Error: ", e) - traceback.print_exc() + logger.exception(e) def download_files(self, weibos): """下载文件(图片/视频)""" try: - print(u"即将进行%s下载" % self.describe) - for w in tqdm(weibos, desc="Download progress"): - if getattr(w, self.key) != u"无": + logger.info(u'即将进行%s下载', self.describe) + for w in tqdm(weibos, desc='Download progress'): + if getattr(w, self.key) != u'无': self.handle_download(getattr(w, self.key), w) - print(u"%s下载完毕,保存路径:" % self.describe) - print(self.file_dir) + logger.info(u'%s下载完毕,保存路径:', self.describe) + logger.info(self.file_dir) except Exception as e: - print("Error: ", e) - traceback.print_exc() + logger.exception(e) diff --git a/weibo_spider/downloader/img_downloader.py b/weibo_spider/downloader/img_downloader.py index 03572258..5ae88673 100644 --- a/weibo_spider/downloader/img_downloader.py +++ b/weibo_spider/downloader/img_downloader.py @@ -7,28 +7,28 @@ class ImgDownloader(Downloader): def __init__(self, file_dir): self.file_dir = file_dir - self.file_type = "img" - self.describe = u"图片" - self.key = "original_pictures" + self.file_type = 'img' + self.describe = u'图片' + self.key = 'original_pictures' def handle_download(self, urls, w): """处理下载相关操作""" - file_prefix = w.publish_time[:11].replace("-", "") + "_" + w.id - if "," in urls: - url_list = urls.split(",") + file_prefix = w.publish_time[:11].replace('-', '') + '_' + w.id + if ',' in urls: + url_list = urls.split(',') for i, url in enumerate(url_list): - index = url.rfind(".") + index = url.rfind('.') if len(url) - index >= 5: - file_suffix = ".jpg" + file_suffix = '.jpg' else: file_suffix = url[index:] - file_name = file_prefix + "_" + str(i + 1) + file_suffix + file_name = file_prefix + '_' + str(i + 1) + file_suffix file_path = 
self.file_dir + os.sep + file_name self.download_one_file(url, file_path, w.id) else: - index = urls.rfind(".") + index = urls.rfind('.') if len(urls) - index > 5: - file_suffix = ".jpg" + file_suffix = '.jpg' else: file_suffix = urls[index:] file_name = file_prefix + file_suffix diff --git a/weibo_spider/downloader/video_downloader.py b/weibo_spider/downloader/video_downloader.py index ae9029e8..fc9cc480 100644 --- a/weibo_spider/downloader/video_downloader.py +++ b/weibo_spider/downloader/video_downloader.py @@ -7,14 +7,14 @@ class VideoDownloader(Downloader): def __init__(self, file_dir): self.file_dir = file_dir - self.file_type = "img" - self.describe = u"视频" - self.key = "video_url" + self.file_type = 'img' + self.describe = u'视频' + self.key = 'video_url' def handle_download(self, urls, w): """处理下载相关操作""" - file_prefix = w.publish_time[:11].replace("-", "") + "_" + w.id - file_suffix = ".mp4" + file_prefix = w.publish_time[:11].replace('-', '') + '_' + w.id + file_suffix = '.mp4' file_name = file_prefix + file_suffix file_path = self.file_dir + os.sep + file_name self.download_one_file(urls, file_path, w.id) diff --git a/weibo_spider/logging.conf b/weibo_spider/logging.conf new file mode 100644 index 00000000..84233792 --- /dev/null +++ b/weibo_spider/logging.conf @@ -0,0 +1,45 @@ +[loggers] +keys=root,spider + +[handlers] +keys=consoleHandler,fileHandler,errorHandler + +[formatters] +keys=consoleFormatter,fileFormatter,errorFormatter + +[logger_root] +level=DEBUG +handlers=consoleHandler,fileHandler,errorHandler + +[logger_spider] +level=DEBUG +handlers=consoleHandler,fileHandler,errorHandler +qualname=spider +propagate=0 + +[handler_consoleHandler] +class=StreamHandler +level=DEBUG +formatter=consoleFormatter +args=(sys.stdout,) + +[handler_fileHandler] +class=handlers.TimedRotatingFileHandler +level=INFO +formatter=fileFormatter +args=('all.log', 'D', 1, 5, 'utf-8', False, False) + +[handler_errorHandler] +class=FileHandler +level=WARNING 
+formatter=errorFormatter +args=('error.log', 'a','utf-8') + +[formatter_consoleFormatter] +format=%(message)s + +[formatter_fileFormatter] +format=%(asctime)s - %(filename)s - %(levelname)s - %(message)s + +[formatter_errorFormatter] +format=%(asctime)s - %(levelname)s - %(filename)s[:%(lineno)d] - %(message)s \ No newline at end of file diff --git a/weibo_spider/parser/comment_parser.py b/weibo_spider/parser/comment_parser.py index ea28acb7..af6aa1e8 100644 --- a/weibo_spider/parser/comment_parser.py +++ b/weibo_spider/parser/comment_parser.py @@ -1,15 +1,17 @@ +import logging import random -import traceback from time import sleep from .parser import Parser -from .util import handle_html, handle_garbled +from .util import handle_garbled, handle_html + +logger = logging.getLogger('spider.comment_parser') class CommentParser(Parser): def __init__(self, cookie, weibo_id): self.cookie = cookie - self.url = "https://weibo.cn/comment/" + weibo_id + self.url = 'https://weibo.cn/comment/' + weibo_id self.selector = handle_html(self.cookie, self.url) def get_long_weibo(self): @@ -21,22 +23,20 @@ def get_long_weibo(self): info = self.selector.xpath("//div[@class='c']")[1] wb_content = handle_garbled(info) wb_time = info.xpath("//span[@class='ct']/text()")[0] - weibo_content = wb_content[wb_content.find(":") + + weibo_content = wb_content[wb_content.find(':') + 1:wb_content.rfind(wb_time)] if weibo_content is not None: return weibo_content sleep(random.randint(6, 10)) - except Exception as e: - return u"网络出错" - print("Error: ", e) - traceback.print_exc() + except Exception: + logger.exception(u'网络出错') + return u'网络出错' def get_long_retweet(self): """获取长转发微博""" try: wb_content = self.get_long_weibo() - weibo_content = wb_content[:wb_content.rfind(u"原文转发")] + weibo_content = wb_content[:wb_content.rfind(u'原文转发')] return weibo_content except Exception as e: - print("Error: ", e) - traceback.print_exc() + logger.exception(e) diff --git a/weibo_spider/parser/index_parser.py 
b/weibo_spider/parser/index_parser.py index db870053..2a82bb1a 100644 --- a/weibo_spider/parser/index_parser.py +++ b/weibo_spider/parser/index_parser.py @@ -1,15 +1,17 @@ -import traceback +import logging from .info_parser import InfoParser from .parser import Parser from .util import handle_html +logger = logging.getLogger('spider.index_parser') + class IndexParser(Parser): def __init__(self, cookie, user_uri): self.cookie = cookie self.user_uri = user_uri - self.url = "https://weibo.cn/%s" % (user_uri) + self.url = 'https://weibo.cn/%s' % (user_uri) self.selector = handle_html(self.cookie, self.url) def _get_user_id(self): @@ -17,10 +19,10 @@ def _get_user_id(self): user_id = self.user_uri url_list = self.selector.xpath("//div[@class='u']//a") for url in url_list: - if (url.xpath("string(.)")) == u"资料": - if url.xpath("@href") and url.xpath("@href")[0].endswith( - "/info"): - link = url.xpath("@href")[0] + if (url.xpath('string(.)')) == u'资料': + if url.xpath('@href') and url.xpath('@href')[0].endswith( + '/info'): + link = url.xpath('@href')[0] user_id = link[1:-5] break return user_id @@ -39,8 +41,7 @@ def get_user(self): self.user.followers = int(user_info[2][3:-1]) return self.user except Exception as e: - print("Error: ", e) - traceback.print_exc() + logger.exception(e) def get_page_num(self): """获取微博总页数""" @@ -49,8 +50,7 @@ def get_page_num(self): page_num = 1 else: page_num = (int)(self.selector.xpath("//input[@name='mp']") - [0].attrib["value"]) + [0].attrib['value']) return page_num except Exception as e: - print("Error: ", e) - traceback.print_exc() + logger.exception(e) diff --git a/weibo_spider/parser/info_parser.py b/weibo_spider/parser/info_parser.py index c0330bff..6928c7c2 100644 --- a/weibo_spider/parser/info_parser.py +++ b/weibo_spider/parser/info_parser.py @@ -1,55 +1,56 @@ +import logging import sys -import traceback +from ..user import User from .parser import Parser from .util import handle_html -from ..user import User +logger = 
logging.getLogger('spider.info_parser') class InfoParser(Parser): def __init__(self, cookie, user_id): self.cookie = cookie - self.url = "https://weibo.cn/%s/info" % (user_id) + self.url = 'https://weibo.cn/%s/info' % (user_id) self.selector = handle_html(self.cookie, self.url) def extract_user_info(self): """提取用户信息""" try: user = User() - nickname = self.selector.xpath("//title/text()")[0] + nickname = self.selector.xpath('//title/text()')[0] nickname = nickname[:-3] - if nickname == u"登录 - 新" or nickname == u"新浪": - sys.exit(u"cookie错误或已过期,请按照README中方法重新获取") + if nickname == u'登录 - 新' or nickname == u'新浪': + logger.warning(u'cookie错误或已过期,请按照README中方法重新获取') + sys.exit() user.nickname = nickname basic_info = self.selector.xpath("//div[@class='c'][3]/text()") - zh_list = [u"性别", u"地区", u"生日", u"简介", u"认证", u"达人"] + zh_list = [u'性别', u'地区', u'生日', u'简介', u'认证', u'达人'] en_list = [ - "gender", "location", "birthday", "description", - "verified_reason", "talent" + 'gender', 'location', 'birthday', 'description', + 'verified_reason', 'talent' ] for i in basic_info: - if i.split(":", 1)[0] in zh_list: - setattr(user, en_list[zh_list.index(i.split(":", 1)[0])], - i.split(":", 1)[1].replace("\u3000", "")) + if i.split(':', 1)[0] in zh_list: + setattr(user, en_list[zh_list.index(i.split(':', 1)[0])], + i.split(':', 1)[1].replace('\u3000', '')) if self.selector.xpath( - "//div[@class='tip'][2]/text()")[0] == u"学习经历": + "//div[@class='tip'][2]/text()")[0] == u'学习经历': user.education = self.selector.xpath( "//div[@class='c'][4]/text()")[0][1:].replace( - u"\xa0", u" ") + u'\xa0', u' ') if self.selector.xpath( - "//div[@class='tip'][3]/text()")[0] == u"工作经历": + "//div[@class='tip'][3]/text()")[0] == u'工作经历': user.work = self.selector.xpath( "//div[@class='c'][5]/text()")[0][1:].replace( - u"\xa0", u" ") + u'\xa0', u' ') elif self.selector.xpath( - "//div[@class='tip'][2]/text()")[0] == u"工作经历": + "//div[@class='tip'][2]/text()")[0] == u'工作经历': user.work = self.selector.xpath( 
"//div[@class='c'][4]/text()")[0][1:].replace( - u"\xa0", u" ") + u'\xa0', u' ') return user except Exception as e: - print("Error: ", e) - traceback.print_exc() + logger.exception(e) diff --git a/weibo_spider/parser/mblog_picAll_parser.py b/weibo_spider/parser/mblog_picAll_parser.py index d4e7381a..1bc66610 100644 --- a/weibo_spider/parser/mblog_picAll_parser.py +++ b/weibo_spider/parser/mblog_picAll_parser.py @@ -1,5 +1,3 @@ - - from .parser import Parser from .util import handle_html @@ -7,8 +5,8 @@ class MblogPicAllParser(Parser): def __init__(self, cookie, weibo_id): self.cookie = cookie - self.url = "https://weibo.cn/mblog/picAll/" + weibo_id + "?rl=1" + self.url = 'https://weibo.cn/mblog/picAll/' + weibo_id + '?rl=1' self.selector = handle_html(self.cookie, self.url) def extract_preview_picture_list(self): - return self.selector.xpath("//img/@src") + return self.selector.xpath('//img/@src') diff --git a/weibo_spider/parser/page_parser.py b/weibo_spider/parser/page_parser.py index aa4e6cc4..c7c1083a 100644 --- a/weibo_spider/parser/page_parser.py +++ b/weibo_spider/parser/page_parser.py @@ -1,6 +1,6 @@ +import logging import re import sys -import traceback from datetime import datetime, timedelta import requests @@ -12,6 +12,8 @@ from .parser import Parser from .util import handle_garbled, handle_html +logger = logging.getLogger('spider.page_parser') + class PageParser(Parser): def __init__(self, cookie, user_config, page, filter): @@ -56,14 +58,13 @@ def get_one_page(self, weibo_id_list): continue else: return weibos, weibo_id_list - print(weibo) - print('-' * 100) + logger.info(weibo) + logger.info('-' * 100) weibos.append(weibo) weibo_id_list.append(weibo.id) return weibos, weibo_id_list except Exception as e: - print('Error: ', e) - traceback.print_exc() + logger.exception(e) def is_original(self, info): """判断微博是否为原创微博""" @@ -86,8 +87,7 @@ def get_original_weibo(self, info, weibo_id): weibo_content = wb_content return weibo_content except Exception as e: 
- print('Error: ', e) - traceback.print_exc() + logger.exception(e) def get_retweet(self, info, weibo_id): """获取转发微博""" @@ -115,8 +115,7 @@ def get_retweet(self, info, weibo_id): weibo_content) return weibo_content except Exception as e: - print('Error: ', e) - traceback.print_exc() + logger.exception(e) def get_weibo_content(self, info, is_original): """获取微博内容""" @@ -128,8 +127,7 @@ def get_weibo_content(self, info, is_original): weibo_content = self.get_retweet(info, weibo_id) return weibo_content except Exception as e: - print('Error: ', e) - traceback.print_exc() + logger.exception(e) def get_article_url(self, info): """获取微博头条文章的url""" @@ -163,8 +161,7 @@ def get_publish_place(self, info): break return publish_place except Exception as e: - print('Error: ', e) - traceback.print_exc() + logger.exception(e) def get_publish_time(self, info): """获取微博发布时间""" @@ -195,8 +192,7 @@ def get_publish_time(self, info): publish_time = publish_time[:16] return publish_time except Exception as e: - print('Error: ', e) - traceback.print_exc() + logger.exception(e) def get_publish_tool(self, info): """获取微博发布工具""" @@ -209,8 +205,7 @@ def get_publish_tool(self, info): publish_tool = u'无' return publish_tool except Exception as e: - print('Error: ', e) - traceback.print_exc() + logger.exception(e) def get_weibo_footer(self, info): """获取微博点赞数、转发数、评论数""" @@ -232,8 +227,7 @@ def get_weibo_footer(self, info): footer['comment_num'] = comment_num return footer except Exception as e: - print('Error: ', e) - traceback.print_exc() + logger.exception(e) def get_picture_urls(self, info, is_original): """获取微博原始图片url""" @@ -259,12 +253,12 @@ def get_picture_urls(self, info, is_original): picture_urls['original_pictures'] = original_picture return picture_urls except Exception as e: - print('Error: ', e) - traceback.print_exc() + logger.exception(e) def get_video_url(self, info, is_original): """获取微博视频url""" try: + video_url = u'无' if is_original: div_first = info.xpath('div')[0] a_list = 
div_first.xpath('.//a') @@ -285,13 +279,10 @@ def get_video_url(self, info, is_original): video_url = wb_info['data']['object']['stream']['url'] if not video_url: # 说明该视频为直播 video_url = u'无' - else: - video_url = u'无' return video_url except Exception as e: + logger.exception(e) return u'无' - print('Error: ', e) - traceback.print_exc() def is_pinned_weibo(self, info): """判断微博是否为置顶微博""" @@ -329,11 +320,10 @@ def get_one_weibo(self, info): weibo.comment_num = footer['comment_num'] # 评论数 else: weibo = None - print(u'正在过滤转发微博') + logger.info(u'正在过滤转发微博') return weibo except Exception as e: - print('Error: ', e) - traceback.print_exc() + logger.exception(e) def extract_picture_urls(self, info, weibo_id): """提取微博原始图片url""" @@ -363,12 +353,12 @@ def extract_picture_urls(self, info, weibo_id): '/wap180/', '/large/') break else: - sys.exit( + logger.warning( u'爬虫微博可能被设置成了"不显示图片",请前往' u'"https://weibo.cn/account/customize/pic",修改为"显示"' ) + sys.exit() return picture_urls except Exception as e: + logger.exception(e) return u'无' - print('Error: ', e) - traceback.print_exc() diff --git a/weibo_spider/parser/parser.py b/weibo_spider/parser/parser.py index cee1a03d..41302ba4 100644 --- a/weibo_spider/parser/parser.py +++ b/weibo_spider/parser/parser.py @@ -1,5 +1,5 @@ class Parser: def __init__(self, cookie): self.cookie = cookie - self.url = "" + self.url = '' self.selector = None diff --git a/weibo_spider/parser/util.py b/weibo_spider/parser/util.py index e311b02e..55f61372 100644 --- a/weibo_spider/parser/util.py +++ b/weibo_spider/parser/util.py @@ -1,6 +1,6 @@ import hashlib +import logging import sys -import traceback import requests from lxml import etree @@ -8,7 +8,8 @@ # Set GENERATE_TEST_DATA to True when generating test data. 
GENERATE_TEST_DATA = False TEST_DATA_DIR = 'tests/testdata' -URL_MAP_FILE = "url_map.json" +URL_MAP_FILE = 'url_map.json' +logger = logging.getLogger('spider.util') def hash_url(url): @@ -25,8 +26,8 @@ def handle_html(cookie, url): import json import os - resp_file = os.path.join(TEST_DATA_DIR, "%s.html" % hash_url(url)) - with io.open(resp_file, "w") as f: + resp_file = os.path.join(TEST_DATA_DIR, '%s.html' % hash_url(url)) + with io.open(resp_file, 'w') as f: f.write(resp.text) with io.open(os.path.join(TEST_DATA_DIR, URL_MAP_FILE), 'r+') as f: @@ -39,16 +40,14 @@ def handle_html(cookie, url): selector = etree.HTML(resp.content) return selector except Exception as e: - print("Error: ", e) - traceback.print_exc() + logger.exception(e) def handle_garbled(info): """处理乱码""" try: - info = (info.xpath("string(.)").replace(u"\u200b", "").encode( - sys.stdout.encoding, "ignore").decode(sys.stdout.encoding)) + info = (info.xpath('string(.)').replace(u'\u200b', '').encode( + sys.stdout.encoding, 'ignore').decode(sys.stdout.encoding)) return info except Exception as e: - print("Error: ", e) - traceback.print_exc() + logger.exception(e) diff --git a/weibo_spider/spider.py b/weibo_spider/spider.py index 3eed2607..3bb9f2bf 100644 --- a/weibo_spider/spider.py +++ b/weibo_spider/spider.py @@ -2,11 +2,12 @@ # -*- coding: UTF-8 -*- import json +import logging +import logging.config import os import random import shutil import sys -import traceback from datetime import date, datetime, timedelta from time import sleep @@ -23,6 +24,10 @@ flags.DEFINE_string('u', None, 'The user_id we want to input.') flags.DEFINE_string('user_id_list', None, 'The path to user_id_list.txt.') flags.DEFINE_string('output_dir', None, 'The dir path to store results.') +logging_path = os.path.split( + os.path.realpath(__file__))[0] + os.sep + 'logging.conf' +logging.config.fileConfig(logging_path) +logger = logging.getLogger('spider') class Spider: @@ -53,7 +58,8 @@ def __init__(self, config): if not 
os.path.isabs(user_id_list): user_id_list = os.getcwd() + os.sep + user_id_list if not os.path.isfile(user_id_list): - sys.exit(u'不存在%s文件' % user_id_list) + logger.warning('不存在%s文件', user_id_list) + sys.exit() self.user_config_file_path = user_id_list if FLAGS.u: user_id_list = FLAGS.u.split(',') @@ -118,13 +124,14 @@ def get_weibo_info(self): self.cookie, self.user_config, page, self.filter).get_one_page( self.weibo_id_list) # 获取第page页的全部微博 - print(u'{}已获取{}({})的第{}页微博{}'.format( + logger.info( + u'%s已获取%s(%s)的第%d页微博%s', '-' * 30, self.user.nickname, self.user.id, page, '-' * 30, - )) + ) if weibos: yield weibos else: @@ -138,8 +145,7 @@ def get_weibo_info(self): page1 = page random_pages = random.randint(1, 5) except Exception as e: - print('Error: ', e) - traceback.print_exc() + logger.exception(e) def _get_filepath(self, type): """获取结果文件路径""" @@ -158,8 +164,7 @@ def _get_filepath(self, type): file_path = file_dir + os.sep + self.user.id + '.' + type return file_path except Exception as e: - print('Error: ', e) - traceback.print_exc() + logger.exception(e) def initialize_info(self, user_config): """初始化爬虫信息""" @@ -206,27 +211,27 @@ def start(self): """运行爬虫""" try: if not self.user_config_list: - print( + logger.info( u'没有配置有效的user_id,请通过config.json或user_id_list.txt配置user_id') return for user_config in self.user_config_list: self.get_user_info(user_config['user_uri']) - print(self.user) - print('*' * 100) + logger.info(self.user) + logger.info('*' * 100) self.initialize_info(user_config) self.write_user(self.user) - print('*' * 100) + logger.info('*' * 100) for weibos in self.get_weibo_info(): self.write_weibo(weibos) self.got_num += len(weibos) if not self.filter: - print(u'共爬取' + str(self.got_num) + u'条微博') + logger.info(u'共爬取' + str(self.got_num) + u'条微博') else: - print(u'共爬取' + str(self.got_num) + u'条原创微博') - print(u'信息抓取完毕') - print('*' * 100) + logger.info(u'共爬取' + str(self.got_num) + u'条原创微博') + logger.info(u'信息抓取完毕') + logger.info('*' * 100) if 
self.user_config_file_path or FLAGS.u: config_util.update_user_config_file( @@ -236,8 +241,7 @@ def start(self): self.new_since_date, ) except Exception as e: - print('Error: ', e) - traceback.print_exc() + logger.exception(e) def _get_config(): @@ -249,16 +253,19 @@ def _get_config(): config_path = FLAGS.config_path elif not os.path.isfile(config_path): shutil.copy(src, config_path) - sys.exit(u'请先配置当前目录(%s)下的config.json文件,' - u'如果想了解config.json参数的具体意义及配置方法,请访问\n' - u'https://github.com/dataabc/weiboSpider#2程序设置' % os.getcwd()) + logger.info(u'请先配置当前目录(%s)下的config.json文件,' + u'如果想了解config.json参数的具体意义及配置方法,请访问\n' + u'https://github.com/dataabc/weiboSpider#2程序设置' % + os.getcwd()) + sys.exit() try: with open(config_path) as f: config = json.loads(f.read()) return config except ValueError: - sys.exit(u'config.json 格式不正确,请访问 ' - u'https://github.com/dataabc/weiboSpider#2程序设置') + logger.error(u'config.json 格式不正确,请访问 ' + u'https://github.com/dataabc/weiboSpider#2程序设置') + sys.exit() def main(_): @@ -268,8 +275,7 @@ def main(_): wb = Spider(config) wb.start() # 爬取微博信息 except Exception as e: - print('Error: ', e) - traceback.print_exc() + logger.exception(e) if __name__ == '__main__': diff --git a/weibo_spider/user.py b/weibo_spider/user.py index 29e47f3d..dc135799 100644 --- a/weibo_spider/user.py +++ b/weibo_spider/user.py @@ -20,10 +20,10 @@ def __init__(self): def __str__(self): """打印微博用户信息""" - result = "" - result += u"用户昵称: %s\n" % self.nickname - result += u"用户id: %s\n" % self.id - result += u"微博数: %d\n" % self.weibo_num - result += u"关注数: %d\n" % self.following - result += u"粉丝数: %d\n" % self.followers + result = '' + result += u'用户昵称: %s\n' % self.nickname + result += u'用户id: %s\n' % self.id + result += u'微博数: %d\n' % self.weibo_num + result += u'关注数: %d\n' % self.following + result += u'粉丝数: %d\n' % self.followers return result diff --git a/weibo_spider/weibo.py b/weibo_spider/weibo.py index a5a1d98f..54cec7ff 100644 --- a/weibo_spider/weibo.py +++ 
b/weibo_spider/weibo.py @@ -22,11 +22,11 @@ def __init__(self): def __str__(self): """打印一条微博""" result = self.content + '\n' - result += u"微博发布位置:%s\n" % self.publish_place - result += u"发布时间:%s\n" % self.publish_time - result += u"发布工具:%s\n" % self.publish_tool - result += u"点赞数:%d\n" % self.up_num - result += u"转发数:%d\n" % self.retweet_num - result += u"评论数:%d\n" % self.comment_num - result += u"url:https://weibo.cn/comment/%s\n" % self.id + result += u'微博发布位置:%s\n' % self.publish_place + result += u'发布时间:%s\n' % self.publish_time + result += u'发布工具:%s\n' % self.publish_tool + result += u'点赞数:%d\n' % self.up_num + result += u'转发数:%d\n' % self.retweet_num + result += u'评论数:%d\n' % self.comment_num + result += u'url:https://weibo.cn/comment/%s\n' % self.id return result diff --git a/weibo_spider/writer/csv_writer.py b/weibo_spider/writer/csv_writer.py index eb0e0b4e..193803da 100644 --- a/weibo_spider/writer/csv_writer.py +++ b/weibo_spider/writer/csv_writer.py @@ -1,8 +1,10 @@ import csv -import traceback +import logging from .writer import Writer +logger = logging.getLogger('spider.csv_writer') + class CsvWriter(Writer): def __init__(self, file_path, filter): @@ -20,13 +22,12 @@ def __init__(self, file_path, filter): self.result_headers.insert(4, ('被转发微博原始图片url', 'retweet_pictures')) self.result_headers.insert(5, ('是否为原创微博', 'original')) try: - with open(self.file_path, "a", encoding="utf-8-sig", - newline="") as f: + with open(self.file_path, 'a', encoding='utf-8-sig', + newline='') as f: writer = csv.writer(f) writer.writerows([[kv[0] for kv in self.result_headers]]) except Exception as e: - print("Error: ", e) - traceback.print_exc() + logger.exception(e) def write_user(self, user): self.user = user @@ -36,11 +37,10 @@ def write_weibo(self, weibos): try: result_data = [[w.__dict__[kv[1]] for kv in self.result_headers] for w in weibos] - with open(self.file_path, "a", encoding="utf-8-sig", - newline="") as f: + with open(self.file_path, 'a', 
encoding='utf-8-sig', + newline='') as f: writer = csv.writer(f) writer.writerows(result_data) - print(u"%d条微博写入csv文件完毕,保存路径:%s" % (len(weibos), self.file_path)) + logger.info(u'%d条微博写入csv文件完毕,保存路径:%s', len(weibos), self.file_path) except Exception as e: - print("Error: ", e) - traceback.print_exc() + logger.exception(e) diff --git a/weibo_spider/writer/json_writer.py b/weibo_spider/writer/json_writer.py index 7d1dfbff..bca61c2d 100644 --- a/weibo_spider/writer/json_writer.py +++ b/weibo_spider/writer/json_writer.py @@ -1,9 +1,12 @@ import codecs import json +import logging import os from .writer import Writer +logger = logging.getLogger('spider.json_writer') + class JsonWriter(Writer): def __init__(self, file_path): @@ -14,36 +17,36 @@ def write_user(self, user): def _update_json_data(self, data, weibo_info): """更新要写入json结果文件中的数据,已经存在于json中的信息更新为最新值,不存在的信息添加到data中""" - data["user"] = self.user.__dict__ - if data.get("weibo"): + data['user'] = self.user.__dict__ + if data.get('weibo'): is_new = 1 # 待写入微博是否全部为新微博,即待写入微博与json中的数据不重复 - for old in data["weibo"]: - if weibo_info[-1]["id"] == old["id"]: + for old in data['weibo']: + if weibo_info[-1]['id'] == old['id']: is_new = 0 break if is_new == 0: for new in weibo_info: flag = 1 - for i, old in enumerate(data["weibo"]): - if new["id"] == old["id"]: - data["weibo"][i] = new + for i, old in enumerate(data['weibo']): + if new['id'] == old['id']: + data['weibo'][i] = new flag = 0 break if flag: - data["weibo"].append(new) + data['weibo'].append(new) else: - data["weibo"] += weibo_info + data['weibo'] += weibo_info else: - data["weibo"] = weibo_info + data['weibo'] = weibo_info return data def write_weibo(self, weibos): """将爬到的信息写入json文件""" data = {} if os.path.isfile(self.file_path): - with codecs.open(self.file_path, "r", encoding="utf-8") as f: + with codecs.open(self.file_path, 'r', encoding='utf-8') as f: data = json.load(f) data = self._update_json_data(data, [w.__dict__ for w in weibos]) - with 
codecs.open(self.file_path, "w", encoding="utf-8") as f: + with codecs.open(self.file_path, 'w', encoding='utf-8') as f: f.write(json.dumps(data, indent=4, ensure_ascii=False)) - print(u"%d条微博写入json文件完毕,保存路径:%s" % (len(weibos), self.file_path)) + logger.info(u'%d条微博写入json文件完毕,保存路径:%s', len(weibos), self.file_path) diff --git a/weibo_spider/writer/mongo_writer.py b/weibo_spider/writer/mongo_writer.py index d21aa445..6a76e687 100644 --- a/weibo_spider/writer/mongo_writer.py +++ b/weibo_spider/writer/mongo_writer.py @@ -1,8 +1,11 @@ import copy +import logging import sys from .writer import Writer +logger = logging.getLogger('spider.mongo_writer') + class MongoWriter(Writer): def __init__(self): @@ -13,21 +16,25 @@ def _info_to_mongodb(self, collection, info_list): try: import pymongo except ImportError: - sys.exit(u"系统中可能没有安装pymongo库,请先运行 pip install pymongo ,再运行程序") + logger.warning( + u'系统中可能没有安装pymongo库,请先运行 pip install pymongo ,再运行程序') + sys.exit() try: from pymongo import MongoClient client = MongoClient() - db = client["weibo"] + db = client['weibo'] collection = db[collection] new_info_list = copy.deepcopy(info_list) for info in new_info_list: - if not collection.find_one({"id": info["id"]}): + if not collection.find_one({'id': info['id']}): collection.insert_one(info) else: - collection.update_one({"id": info["id"]}, {"$set": info}) + collection.update_one({'id': info['id']}, {'$set': info}) except pymongo.errors.ServerSelectionTimeoutError: - sys.exit(u"系统中可能没有安装或启动MongoDB数据库,请先根据系统环境安装或启动MongoDB,再运行程序") + logger.warning( + u'系统中可能没有安装或启动MongoDB数据库,请先根据系统环境安装或启动MongoDB,再运行程序') + sys.exit() def write_weibo(self, weibos): """将爬取的微博信息写入MongoDB数据库""" @@ -35,12 +42,12 @@ def write_weibo(self, weibos): for w in weibos: w.user_id = self.user.id weibo_list.append(w.__dict__) - self._info_to_mongodb("weibo", weibo_list) - print(u"%d条微博写入MongoDB数据库完毕" % len(weibos)) + self._info_to_mongodb('weibo', weibo_list) + logger.info(u'%d条微博写入MongoDB数据库完毕', len(weibos)) def 
write_user(self, user): """将爬取的用户信息写入MongoDB数据库""" self.user = user user_list = [user.__dict__] - self._info_to_mongodb("user", user_list) - print(u"%s信息写入MongoDB数据库完毕" % user.nickname) + self._info_to_mongodb('user', user_list) + logger.info(u'%s信息写入MongoDB数据库完毕', user.nickname) diff --git a/weibo_spider/writer/mysql_writer.py b/weibo_spider/writer/mysql_writer.py index 070cee1e..d395b123 100644 --- a/weibo_spider/writer/mysql_writer.py +++ b/weibo_spider/writer/mysql_writer.py @@ -1,9 +1,11 @@ import copy +import logging import sys -import traceback from .writer import Writer +logger = logging.getLogger('spider.mysql_writer') + class MySqlWriter(Writer): def __init__(self, mysql_config): @@ -13,7 +15,7 @@ def __init__(self, mysql_config): create_database = """CREATE DATABASE IF NOT EXISTS weibo DEFAULT CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci""" self._mysql_create_database(create_database) - self.mysql_config["db"] = "weibo" + self.mysql_config['db'] = 'weibo' def _mysql_create(self, connection, sql): """创建MySQL数据库或表""" @@ -28,13 +30,15 @@ def _mysql_create_database(self, sql): try: import pymysql except ImportError: - sys.exit(u"系统中可能没有安装pymysql库,请先运行 pip install pymysql ,再运行程序") + logger.warning( + u'系统中可能没有安装pymysql库,请先运行 pip install pymysql ,再运行程序') + sys.exit() try: - print(self.mysql_config, sql) connection = pymysql.connect(**self.mysql_config) self._mysql_create(connection, sql) except pymysql.OperationalError: - sys.exit(u"系统中可能没有安装或正确配置MySQL数据库,请先根据系统环境安装或配置MySQL,再运行程序") + logger.warning(u'系统中可能没有安装或正确配置MySQL数据库,请先根据系统环境安装或配置MySQL,再运行程序') + sys.exit() def _mysql_create_table(self, sql): """创建MySQL表""" @@ -51,16 +55,16 @@ def _mysql_insert(self, table, data_list): for k, v in data.items() if v is not None} for data in data_list] - keys = ", ".join(data_list[0].keys()) - values = ", ".join(["%s"] * len(data_list[0])) + keys = ', '.join(data_list[0].keys()) + values = ', '.join(['%s'] * len(data_list[0])) connection = 
pymysql.connect(**self.mysql_config) cursor = connection.cursor() sql = """INSERT INTO {table}({keys}) VALUES ({values}) ON DUPLICATE KEY UPDATE""".format(table=table, keys=keys, values=values) - update = ",".join([ - " {key} = values({key})".format(key=key) + update = ','.join([ + ' {key} = values({key})'.format(key=key) for key in data_list[0] ]) sql += update @@ -70,8 +74,7 @@ def _mysql_insert(self, table, data_list): connection.commit() except Exception as e: connection.rollback() - print("Error: ", e) - traceback.print_exc() + logger.exception(e) finally: connection.close() @@ -103,8 +106,8 @@ def write_weibo(self, weibos): for weibo in info_list: weibo.user_id = self.user.id weibo_list.append(weibo.__dict__) - self._mysql_insert("weibo", weibo_list) - print(u"%d条微博写入MySQL数据库完毕" % len(weibos)) + self._mysql_insert('weibo', weibo_list) + logger.info(u'%d条微博写入MySQL数据库完毕', len(weibos)) def write_user(self, user): """将爬取的用户信息写入MySQL数据库""" @@ -129,5 +132,5 @@ def write_user(self, user): PRIMARY KEY (id) ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4""" self._mysql_create_table(create_table) - self._mysql_insert("user", [user.__dict__]) - print(u"%s信息写入MySQL数据库完毕" % user.nickname) + self._mysql_insert('user', [user.__dict__]) + logger.info(u'%s信息写入MySQL数据库完毕', user.nickname) diff --git a/weibo_spider/writer/txt_writer.py b/weibo_spider/writer/txt_writer.py index 24ceb66d..6eddd862 100644 --- a/weibo_spider/writer/txt_writer.py +++ b/weibo_spider/writer/txt_writer.py @@ -1,55 +1,57 @@ +import logging import sys -import traceback from .writer import Writer +logger = logging.getLogger('spider.txt_writer') + class TxtWriter(Writer): def __init__(self, file_path, filter): self.file_path = file_path - self.user_header = u"用户信息" - self.user_desc = [("nickname", "用户昵称"), ("id", "用户id"), - ("weibo_num", "微博数"), ("following", "关注数"), - ("followers", "粉丝数")] + self.user_header = u'用户信息' + self.user_desc = [('nickname', '用户昵称'), ('id', '用户id'), + ('weibo_num', '微博数'), ('following', 
'关注数'), + ('followers', '粉丝数')] if filter: - self.weibo_header = u"原创微博内容" + self.weibo_header = u'原创微博内容' else: - self.weibo_header = u"微博内容" - self.weibo_desc = [("publish_place", "微博位置"), ("publish_time", "发布时间"), - ("up_num", "点赞数"), ("retweet_num", "转发数"), - ("comment_num", "评论数"), ("publish_tool", "发布工具")] + self.weibo_header = u'微博内容' + self.weibo_desc = [('publish_place', '微博位置'), ('publish_time', '发布时间'), + ('up_num', '点赞数'), ('retweet_num', '转发数'), + ('comment_num', '评论数'), ('publish_tool', '发布工具')] def write_user(self, user): self.user = user - user_info = "\n".join( - [v + ":" + str(self.user.__dict__[k]) for k, v in self.user_desc]) + user_info = '\n'.join( + [v + ':' + str(self.user.__dict__[k]) for k, v in self.user_desc]) - with open(self.file_path, "ab") as f: - f.write((self.user_header + ":\n" + user_info + "\n\n").encode( + with open(self.file_path, 'ab') as f: + f.write((self.user_header + ':\n' + user_info + '\n\n').encode( sys.stdout.encoding)) - print(u"%s信息写入txt文件完毕,保存路径:%s" % (self.user.nickname, self.file_path)) + logger.info(u'%s信息写入txt文件完毕,保存路径:%s', self.user.nickname, + self.file_path) def write_weibo(self, weibo): """将爬取的信息写入txt文件""" - weibo_header = "" + weibo_header = '' if self.weibo_header: - weibo_header = self.weibo_header + ":\n" - self.weibo_header = "" + weibo_header = self.weibo_header + ':\n' + self.weibo_header = '' try: temp_result = [] for w in weibo: - temp_result.append(w.__dict__["content"] + "\n" + "\n".join( - [v + ":" + str(w.__dict__[k]) + temp_result.append(w.__dict__['content'] + '\n' + '\n'.join( + [v + ':' + str(w.__dict__[k]) for k, v in self.weibo_desc])) - result = "\n\n".join(temp_result) + "\n\n" + result = '\n\n'.join(temp_result) + '\n\n' - with open(self.file_path, "ab") as f: + with open(self.file_path, 'ab') as f: f.write((weibo_header + result).encode(sys.stdout.encoding)) - print(u"%d条微博写入txt文件完毕,保存路径:%s" % (len(weibo), self.file_path)) + logger.info(u'%d条微博写入txt文件完毕,保存路径:%s', len(weibo), 
self.file_path) except Exception as e: - print("Error: ", e) - traceback.print_exc() + logger.exception(e) From 0563200fc0847aa55283fc32c424fa9c7fb9cd8a Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Mon, 20 Jul 2020 01:13:56 +0800 Subject: [PATCH 248/363] Create contributors.md --- docs/contributors.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 docs/contributors.md diff --git a/docs/contributors.md b/docs/contributors.md new file mode 100644 index 00000000..03fa9b3e --- /dev/null +++ b/docs/contributors.md @@ -0,0 +1,14 @@ +# 贡献者 + +感谢所有为本项目作出贡献和将要作出贡献的朋友,感谢对开源事业的支持。大家每贡献一行code都让项目功能更丰富,每提一个建议都让程序更完善,每发现一个bug都让代码更健壮,所有美好的事物终将成就伟大。 + +本项目贡献者包含三部分:主要代码开发者、代码贡献者和优质issue提出者。以下按贡献者的用户名首字母列出,若某贡献者在多部分都有贡献,则以主要贡献为准。 + +## 主要代码开发者 + +[dataabc](https://github.com/dataabc) [songzy12](https://github.com/songzy12) + +## 代码贡献者 +[duangan1](https://github.com/duangan1) [codermino](https://github.com/codermino) + +## 优质issue提出者 From 1cfc52684116cc16235977cd706143e365a9d244 Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Wed, 22 Jul 2020 02:09:12 +0800 Subject: [PATCH 249/363] Update contributors.md --- docs/contributors.md | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/docs/contributors.md b/docs/contributors.md index 03fa9b3e..e8aac6e0 100644 --- a/docs/contributors.md +++ b/docs/contributors.md @@ -6,9 +6,12 @@ ## 主要代码开发者 -[dataabc](https://github.com/dataabc) [songzy12](https://github.com/songzy12) +|[dataabc](https://github.com/dataabc) |[songzy12](https://github.com/songzy12) | +| - | - | ## 代码贡献者 -[duangan1](https://github.com/duangan1) [codermino](https://github.com/codermino) + +|[codermino](https://github.com/codermino) |[duangan1](https://github.com/duangan1) | +| - | - | ## 优质issue提出者 From cb1abe7c06c4b244c8e0cb8ebd599a8de91f9c89 Mon Sep 17 00:00:00 2001 From: dataabc Date: Wed, 22 Jul 2020 21:24:23 +0800 Subject: [PATCH 250/363] =?UTF-8?q?perf:=20=E4=BC=98=E5=8C=96not=5Fdownloa?= 
=?UTF-8?q?ded.txt=E5=86=85=E5=AE=B9=E5=86=99=E5=85=A5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 由 weibo_id + url形式 改为 weibo_id + file_path + url形式 --- weibo_spider/downloader/downloader.py | 16 ++-------------- weibo_spider/downloader/img_downloader.py | 2 -- weibo_spider/downloader/video_downloader.py | 2 -- 3 files changed, 2 insertions(+), 18 deletions(-) diff --git a/weibo_spider/downloader/downloader.py b/weibo_spider/downloader/downloader.py index b198b45b..63b1ee26 100644 --- a/weibo_spider/downloader/downloader.py +++ b/weibo_spider/downloader/downloader.py @@ -14,8 +14,6 @@ class Downloader(ABC): def __init__(self, file_dir): self.file_dir = file_dir - - self.file_type = '' self.describe = u'' self.key = '' @@ -24,16 +22,6 @@ def handle_download(self, urls, w): """下载 urls 里所指向的图片或视频文件,使用 w 里的信息来生成文件名""" pass - def get_filepath(self): - """获取结果文件路径""" - try: - file_dir = self.file_dir + os.sep + self.file_type - if not os.path.isdir(file_dir): - os.makedirs(file_dir) - return file_dir - except Exception as e: - logger.exception(e) - def download_one_file(self, url, file_path, weibo_id): """下载单个文件(图片/视频)""" try: @@ -44,9 +32,9 @@ def download_one_file(self, url, file_path, weibo_id): with open(file_path, 'wb') as f: f.write(downloaded.content) except Exception as e: - error_file = self.get_filepath() + os.sep + 'not_downloaded.txt' + error_file = self.file_dir + os.sep + 'not_downloaded.txt' with open(error_file, 'ab') as f: - url = weibo_id + ':' + url + '\n' + url = weibo_id + ':' + file_path + ':' + url + '\n' f.write(url.encode(sys.stdout.encoding)) logger.exception(e) diff --git a/weibo_spider/downloader/img_downloader.py b/weibo_spider/downloader/img_downloader.py index 5ae88673..88274062 100644 --- a/weibo_spider/downloader/img_downloader.py +++ b/weibo_spider/downloader/img_downloader.py @@ -6,8 +6,6 @@ class ImgDownloader(Downloader): def __init__(self, file_dir): self.file_dir = file_dir - - 
self.file_type = 'img' self.describe = u'图片' self.key = 'original_pictures' diff --git a/weibo_spider/downloader/video_downloader.py b/weibo_spider/downloader/video_downloader.py index fc9cc480..cc0171ba 100644 --- a/weibo_spider/downloader/video_downloader.py +++ b/weibo_spider/downloader/video_downloader.py @@ -6,8 +6,6 @@ class VideoDownloader(Downloader): def __init__(self, file_dir): self.file_dir = file_dir - - self.file_type = 'img' self.describe = u'视频' self.key = 'video_url' From 3f623d8440f4cf0f0d5d8823839b8eeab1772f07 Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Fri, 24 Jul 2020 01:29:19 +0800 Subject: [PATCH 251/363] Update contributors.md --- docs/contributors.md | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/docs/contributors.md b/docs/contributors.md index e8aac6e0..40082f1e 100644 --- a/docs/contributors.md +++ b/docs/contributors.md @@ -1,8 +1,8 @@ # 贡献者 -感谢所有为本项目作出贡献和将要作出贡献的朋友,感谢对开源事业的支持。大家每贡献一行code都让项目功能更丰富,每提一个建议都让程序更完善,每发现一个bug都让代码更健壮,所有美好的事物终将成就伟大。 +感谢所有为本项目作出贡献和将要作出贡献的朋友,感谢对开源事业的支持。大家每贡献一行code都让项目功能更丰富,每提一个建议都让程序更完善,每发现一个bug都让代码更健壮。 -本项目贡献者包含三部分:主要代码开发者、代码贡献者和优质issue提出者。以下按贡献者的用户名首字母列出,若某贡献者在多部分都有贡献,则以主要贡献为准。 +本项目贡献者包含三部分:主要代码开发者、代码贡献者和优质issue提出者。以下按贡献者的用户名首字母排序,若某贡献者在多部分都有贡献,则以主要贡献为准。 ## 主要代码开发者 @@ -15,3 +15,12 @@ | - | - | ## 优质issue提出者 + +| | | | | | | +| - | - | - | - | - | - | +| [13531982270](https://github.com/13531982270) | [Archenemy61](https://github.com/Archenemy61) | [arctanx](https://github.com/arctanx) |[bossming](https://github.com/bossming)|[bubblesran](https://github.com/bubblesran)| [cangling](https://github.com/cangling)| +| [Ccccche](https://github.com/Ccccche) | [Evifly](https://github.com/Evifly) | [gudaost](https://github.com/gudaost) | [Hylan129](https://github.com/Hylan129) | [HZzzzy](https://github.com/HZzzzy) | [kur0mi](https://github.com/kur0mi) | +| [leonall](https://github.com/leonall) | [liu-song](https://github.com/liu-song) | [Issac110](https://github.com/Issac110) | 
[MengyingQian](https://github.com/MengyingQian) | [PandGnone](https://github.com/PandGnone) | [PLQin](https://github.com/PLQin) | +| [redMUSCLE](https://github.com/redMUSCLE) | [shengdade](https://github.com/shengdade) | [softrime](https://github.com/softrime) | [SugimitoYuuji](https://github.com/SugimitoYuuji) | [sunbat](https://github.com/sunbat) | [taichifox95](https://github.com/taichifox95) | +| [Twinklingcode](https://github.com/Twinklingcode) | [vincentlee5](https://github.com/vincentlee5) | [wiidi](https://github.com/wiidi) | [wwwpf](https://github.com/wwwpf) | [xiaomingdaily](https://github.com/xiaomingdaily) | [xiekeyi98](https://github.com/xiekeyi98) | +| [xnzmc](https://github.com/xnzmc) | [yangy9593](https://github.com/yangy9593) | [zhangjibao](https://github.com/zhangjibao) | From 16191dbd734378df3d6eec92254711b61b8ee23f Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Fri, 24 Jul 2020 01:46:18 +0800 Subject: [PATCH 252/363] Update README.md --- README.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/README.md b/README.md index 49a6f54e..f2e3dbe5 100644 --- a/README.md +++ b/README.md @@ -38,6 +38,8 @@ - [如何获取user_id](#如何获取user_id) - [常见问题](#常见问题) - [相关项目](#相关项目) + - [贡献](#贡献) + - [贡献者](#贡献者) - [注意事项](#注意事项) ## 获取到的字段 @@ -229,6 +231,14 @@ $ python3 -m weibo_spider --u="1669879400,1223178222" - [weibo-crawler](https://github.com/dataabc/weibo-crawler) - 功能和本项目完全一样,可以不添加cookie,获取的微博属性更多; - [weibo-search](https://github.com/dataabc/weibo-search) - 可以连续获取一个或多个**微博关键词搜索**结果,并将结果写入文件(可选)、数据库(可选)等。所谓微博关键词搜索即:**搜索正文中包含指定关键词的微博**,可以指定搜索的时间范围。对于非常热门的关键词,一天的时间范围,可以获得**1000万**以上的搜索结果,N天的时间范围就可以获得1000万 X N搜索结果。对于大多数关键词,一天产生的相应微博数量应该在1000万条以下,因此可以说该程序可以获得大部分关键词的全部或近似全部的搜索结果。而且该程序可以获得搜索结果的所有信息,本程序获得的微博信息该程序都能获得。 +## 贡献 + +欢迎为本项目贡献力量。贡献可以是提交代码,可以是通过issue提建议(如新功能、改进方案等),也可以是通过issue告知我们项目存在哪些bug、缺点等,具体贡献方式见[为本项目做贡献](https://github.com/dataabc/weiboSpider/blob/master/CONTRIBUTING.md)。 + +## 贡献者 + 
+感谢所有为本项目贡献力量的朋友,贡献者详情见[贡献者](https://github.com/dataabc/weiboSpider/blob/master/docs/contributors.md)页面。 + ## 注意事项 1.user_id不能为爬虫微博的user_id。因为要爬微博信息,必须先登录到某个微博账号,此账号我们姑且称为爬虫微博。爬虫微博访问自己的页面和访问其他用户的页面,得到的网页格式不同,所以无法爬取自己的微博信息;如果想要爬取爬虫微博内容,可以参考[获取自身微博信息](https://github.com/dataabc/weiboSpider/issues/113); From 79f5bd0fdfa7c7df6e53d4de45991a5dac9d5e37 Mon Sep 17 00:00:00 2001 From: dataabc Date: Sat, 25 Jul 2020 01:09:54 +0800 Subject: [PATCH 253/363] =?UTF-8?q?fix:=20=E4=BF=AE=E5=A4=8D=E5=BD=93since?= =?UTF-8?q?=5Fdate=E4=B8=BA=E6=9C=AA=E6=9D=A5=E6=97=B6=EF=BC=8Cuser=5Fid?= =?UTF-8?q?=5Flist.txt=E7=94=9F=E6=88=90=E7=9A=84=E6=96=B0since=5Fdate?= =?UTF-8?q?=E4=B8=BA=E7=A9=BA=E7=9A=84bug?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- weibo_spider/spider.py | 10 ++++------ weibo_spider/user_id_list.txt | 4 +++- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/weibo_spider/spider.py b/weibo_spider/spider.py index 3bb9f2bf..be9175f9 100644 --- a/weibo_spider/spider.py +++ b/weibo_spider/spider.py @@ -114,11 +114,6 @@ def get_weibo_info(self): self.user_config['user_uri']).get_page_num() # 获取微博总页数 page1 = 0 random_pages = random.randint(1, 5) - if self.end_date == 'now': - self.new_since_date = datetime.now().strftime( - '%Y-%m-%d %H:%M') - else: - self.new_since_date = self.end_date for page in tqdm(range(1, page_num + 1), desc='Progress'): weibos, self.weibo_id_list = PageParser( self.cookie, @@ -171,7 +166,10 @@ def initialize_info(self, user_config): self.got_num = 0 self.user_config = user_config self.weibo_id_list = [] - + if self.end_date == 'now': + self.new_since_date = datetime.now().strftime('%Y-%m-%d %H:%M') + else: + self.new_since_date = self.end_date self.writers = [] if 'csv' in self.write_mode: from .writer import CsvWriter diff --git a/weibo_spider/user_id_list.txt b/weibo_spider/user_id_list.txt index 7a9ac042..ead74227 100644 --- a/weibo_spider/user_id_list.txt +++ b/weibo_spider/user_id_list.txt 
@@ -1 +1,3 @@ -7053204102 majiko 2020-06-02 09:37 09:32 09:30 09:29 09:28 09:24 09:13 \ No newline at end of file +1669879400 Dear-迪丽热巴 2020-01-13 19:18 +1223178222 胡歌 2020-01-13 19:28 +1729370543 郭碧婷 2020-01-13 19:33 \ No newline at end of file From 6943914cdd07b2544368164a29fa1e93290f1709 Mon Sep 17 00:00:00 2001 From: dataabc Date: Sat, 25 Jul 2020 01:22:44 +0800 Subject: [PATCH 254/363] =?UTF-8?q?perf:=20=E4=BC=98=E5=8C=96=E5=BE=AE?= =?UTF-8?q?=E5=8D=9A=E9=97=B4=E7=9A=84=E6=97=B6=E9=97=B4=E7=AD=89=E5=BE=85?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Issue #192 --- weibo_spider/spider.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/weibo_spider/spider.py b/weibo_spider/spider.py index be9175f9..30bed18e 100644 --- a/weibo_spider/spider.py +++ b/weibo_spider/spider.py @@ -212,7 +212,15 @@ def start(self): logger.info( u'没有配置有效的user_id,请通过config.json或user_id_list.txt配置user_id') return + user_count = 0 + user_count1 = random.randint(1, 5) + random_users = random.randint(1, 5) for user_config in self.user_config_list: + if (user_count - user_count1) % random_users == 0: + sleep(random.randint(6, 10)) + user_count1 = user_count + random_users = random.randint(1, 5) + user_count += 1 self.get_user_info(user_config['user_uri']) logger.info(self.user) logger.info('*' * 100) From 8bc5b920aa1f2ed60d4484920f8b7df6aab702b5 Mon Sep 17 00:00:00 2001 From: dataabc Date: Sat, 25 Jul 2020 19:20:58 +0800 Subject: [PATCH 255/363] =?UTF-8?q?feat:=20=E6=B7=BB=E5=8A=A0=E9=80=9A?= =?UTF-8?q?=E8=BF=87config.json=E6=8E=A7=E5=88=B6=E9=9A=8F=E6=9C=BA?= =?UTF-8?q?=E7=AD=89=E5=BE=85=E7=9A=84=E5=8A=9F=E8=83=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit random_wait_pages代表随机等待频率,默认是[1, 5],代表每1到5页等待一次;random_wait_seconds代表随机等待时间,默认是[6, 10],代表每次等待6到10秒。 Issue #192 --- weibo_spider/config_sample.json | 4 +++- weibo_spider/spider.py | 24 +++++++++++++++++------- 2 files changed, 20 
insertions(+), 8 deletions(-) diff --git a/weibo_spider/config_sample.json b/weibo_spider/config_sample.json index b0325c28..f7449dd7 100644 --- a/weibo_spider/config_sample.json +++ b/weibo_spider/config_sample.json @@ -2,7 +2,9 @@ "user_id_list": ["1669879400"], "filter": 1, "since_date": "2018-01-01", - "end_date": "now", + "end_date": "now", + "random_wait_pages": [1, 5], + "random_wait_seconds": [6, 10], "write_mode": ["csv", "txt"], "pic_download": 1, "video_download": 1, diff --git a/weibo_spider/spider.py b/weibo_spider/spider.py index 30bed18e..a0b196e9 100644 --- a/weibo_spider/spider.py +++ b/weibo_spider/spider.py @@ -41,6 +41,16 @@ def __init__(self, config): self.since_date = since_date # 起始时间,即爬取发布日期从该值到结束时间的微博,形式为yyyy-mm-dd self.end_date = config[ 'end_date'] # 结束时间,即爬取发布日期从起始时间到该值的微博,形式为yyyy-mm-dd,特殊值"now"代表现在 + random_wait_pages = config['random_wait_pages'] + self.random_wait_pages = [ + min(random_wait_pages), + max(random_wait_pages) + ] + random_wait_seconds = config['random_wait_seconds'] + self.random_wait_seconds = [ + min(random_wait_seconds), + max(random_wait_seconds) + ] self.write_mode = config[ 'write_mode'] # 结果信息保存类型,为list形式,可包含txt、csv、json、mongo和mysql五种类型 self.pic_download = config[ @@ -113,7 +123,7 @@ def get_weibo_info(self): self.cookie, self.user_config['user_uri']).get_page_num() # 获取微博总页数 page1 = 0 - random_pages = random.randint(1, 5) + random_pages = random.randint(*self.random_wait_pages) for page in tqdm(range(1, page_num + 1), desc='Progress'): weibos, self.weibo_id_list = PageParser( self.cookie, @@ -136,9 +146,9 @@ def get_weibo_info(self): # 制会自动解除),加入随机等待模拟人的操作,可降低被系统限制的风险。默 # 认是每爬取1到5页随机等待6到10秒,如果仍然被限,可适当增加sleep时间 if (page - page1) % random_pages == 0 and page < page_num: - sleep(random.randint(6, 10)) + sleep(random.randint(*self.random_wait_seconds)) page1 = page - random_pages = random.randint(1, 5) + random_pages = random.randint(*self.random_wait_pages) except Exception as e: logger.exception(e) @@ -213,13 
+223,13 @@ def start(self): u'没有配置有效的user_id,请通过config.json或user_id_list.txt配置user_id') return user_count = 0 - user_count1 = random.randint(1, 5) - random_users = random.randint(1, 5) + user_count1 = random.randint(*self.random_wait_pages) + random_users = random.randint(*self.random_wait_pages) for user_config in self.user_config_list: if (user_count - user_count1) % random_users == 0: - sleep(random.randint(6, 10)) + sleep(random.randint(*self.random_wait_seconds)) user_count1 = user_count - random_users = random.randint(1, 5) + random_users = random.randint(*self.random_wait_pages) user_count += 1 self.get_user_info(user_config['user_uri']) logger.info(self.user) From 0b46258ac2d95edecf3007358aaef71fd544dd61 Mon Sep 17 00:00:00 2001 From: dataabc Date: Sat, 25 Jul 2020 19:55:49 +0800 Subject: [PATCH 256/363] =?UTF-8?q?feat:=20=E6=B7=BB=E5=8A=A0=E5=AF=B9rand?= =?UTF-8?q?om=5Fwait=5Fpages=E5=92=8Crandom=5Fwait=5Fseconds=E5=8F=82?= =?UTF-8?q?=E6=95=B0=E6=98=AF=E5=90=A6=E6=AD=A3=E7=A1=AE=E7=9A=84=E6=A3=80?= =?UTF-8?q?=E6=B5=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- weibo_spider/config_util.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/weibo_spider/config_util.py b/weibo_spider/config_util.py index 8dfd4421..311f5877 100644 --- a/weibo_spider/config_util.py +++ b/weibo_spider/config_util.py @@ -41,6 +41,26 @@ def validate_config(config): logger.warning(u'end_date值应为yyyy-mm-dd形式或"now",请重新输入') sys.exit() + # 验证random_wait_pages + random_wait_pages = config['random_wait_pages'] + if not isinstance(random_wait_pages, list): + logger.warning(u'random_wait_pages参数值应为list类型,请重新输入') + sys.exit() + if (not isinstance(min(random_wait_pages), int)) or (not isinstance( + max(random_wait_pages), int)): + logger.warning(u'random_wait_pages列表中的值应为整数类型,请重新输入') + sys.exit() + + # 验证random_wait_seconds + random_wait_seconds = config['random_wait_seconds'] + if not isinstance(random_wait_seconds, 
list): + logger.warning(u'random_wait_seconds参数值应为list类型,请重新输入') + sys.exit() + if (not isinstance(min(random_wait_seconds), int)) or (not isinstance( + max(random_wait_seconds), int)): + logger.warning(u'random_wait_seconds列表中的值应为整数类型,请重新输入') + sys.exit() + # 验证write_mode write_mode = ['txt', 'csv', 'json', 'mongo', 'mysql'] if not isinstance(config['write_mode'], list): From 71a558da4cb48629c07cd50f1d89e8128a83074b Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Sun, 26 Jul 2020 18:13:53 +0800 Subject: [PATCH 257/363] Update settings.md --- docs/settings.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/docs/settings.md b/docs/settings.md index 0d0fa389..65f2f7f1 100644 --- a/docs/settings.md +++ b/docs/settings.md @@ -10,6 +10,8 @@ $ python3 -m weibo_spider "filter": 1, "since_date": "2018-01-01", "end_date": "now", + "random_wait_pages": [1, 5], + "random_wait_seconds": [6, 10], "write_mode": ["csv", "txt"], "pic_download": 1, "video_download": 1, @@ -60,6 +62,10 @@ since_date值可以是日期,也可以是整数。如果是日期,代表爬 **since_date是所有user的爬取起始时间,非常不灵活。如果你要爬多个用户,并且想单独为每个用户设置一个since_date,可以使用[定期自动爬取微博](https://github.com/dataabc/weiboSpider/blob/master/docs/automation.md)方法二中的方法,该方法可以为多个用户设置不同的since_date,非常灵活。**
**设置end_date**
end_date值可以是日期,也可以是"now"。如果是日期,代表爬取该日期之前的微博,格式应为“yyyy-mm-dd”;如果是"now",代表爬取发布日期从since_date到现在的微博。since_date配合end_date,表示爬取发布日期在since_date和end_date之间的微博,包含边界。since_date是起始日期,end_date是结束日期,因此end_date时间应晚于since_date。注意,since_date即可以通过config.json文件的since_date参数设置,也可以通过user_id_list.txt设置;而end_date只能通过config.json文件的end_date参数设置,是全局变量,所有user_id都使用同一个end_date。当end_date值不是"now"时,程序无法获取微博中的视频,如果想要获取视频,请为end_date赋值为"now"。
+**设置random_wait_pages**
+random_wait_pages值是一个长度为2的整数列表,代表每爬取x页微博暂停一次,x为整数,值在random_wait_pages列表两个整数之间随机获取。默认值为[1, 5],代表每爬取1到5页暂停一次,如果程序被限制,可以加快暂停频率,即适当减小random_wait_pages内的值。
+**设置random_wait_seconds**
+random_wait_seconds值是一个长度为2的整数列表,代表每次暂停sleep x 秒,x为整数, 值在random_wait_seconds列表两个整数之间随机获取。默认值为[6, 10],代表每次暂停sleep 6到10秒,如果程序被限制,可以增加等待时间,即适当增大random_wait_seconds内的值。
**设置write_mode**
write_mode控制结果文件格式,取值范围是csv、txt、json、mongo和mysql,分别代表将结果文件写入csv、txt、json、MongoDB和MySQL数据库。write_mode可以同时包含这些取值中的一个或几个,如: ``` From 007cc392ed36d067b19afafe566a2b9526d3def0 Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Sun, 26 Jul 2020 18:19:48 +0800 Subject: [PATCH 258/363] Update FAQ.md --- docs/FAQ.md | 29 ++++------------------------- 1 file changed, 4 insertions(+), 25 deletions(-) diff --git a/docs/FAQ.md b/docs/FAQ.md index 74d56bcb..04ecfec0 100644 --- a/docs/FAQ.md +++ b/docs/FAQ.md @@ -7,33 +7,12 @@ python3 -m weibo_spider ``` ### 2.程序运行出错,错误提示中包含“'NoneType' object”字样,如何解决? -这是最常见的问题之一。出错原因是爬取速度太快,被暂时限制了。限制可能包含爬虫账号限制和ip限制。一般情况下,一段时间后限制会自动解除。可通过降低爬取速度避免被限制,具体修改weibo_spider.py文件中get_weibo_info方法的如下代码: +这是最常见的问题之一。出错原因是爬取速度太快,被暂时限制了,限制可能包含爬虫账号限制和ip限制。一般情况下,一段时间后限制会自动解除。可通过降低爬取速度避免被限制,具体修改config.json文件中的如下代码: ``` - if (page - page1) % random_pages == 0 and page < page_num: - sleep(random.randint(6, 10)) - page1 = page - random_pages = random.randint(1, 5) + "random_wait_pages": [1, 5], + "random_wait_seconds": [6, 10], ``` -上面的意思是每爬取1到5页,随机等待6到10秒。可以通过加快暂停频率(减小random_pages值)或增加等待时间(加大sleep内的值)避免被限制。 -如果你设置了只爬取用户信息(不爬用户的微博),则需修改weibo_spider.py文件中的start方法,原来的代码是这样的: -``` - for user_config in self.user_config_list: - ...... -``` -修改后的代码是这样的: -``` - user_count = 0 - user_count1 = random.randint(1, 5) - random_users = random.randint(1, 5) - for user_config in self.user_config_list: - if (user_count - user_count1) % random_users == 0: - sleep(random.randint(6, 10)) - user_count1 = user_count - random_users = random.randint(1, 5) - user_count += 1 - ...... -``` -上面的意思是每爬1到5个用户,随机等待6到10秒,你可以根据实际情况,修改代码中的数字。 +上面的意思是每爬取1到5页,随机等待6到10秒。可以通过加快暂停频率(减小random_wait_pages内的值)或增加等待时间(加大random_wait_seconds内的值)避免被限制。 ### 3.如何获取微博评论? 
因为限制,只能获取一部分评论,无法获取全部,因此暂时没有添加获取评论功能的计划。 From 01e82324ea71d1c55982cebda960750e93b741c4 Mon Sep 17 00:00:00 2001 From: dataabc Date: Sun, 26 Jul 2020 18:33:51 +0800 Subject: [PATCH 259/363] =?UTF-8?q?perf:=20=E4=BC=98=E5=8C=96=E5=8F=82?= =?UTF-8?q?=E6=95=B0random=5Fwait=5Fpages=E3=80=81random=5Fwait=5Fseconds?= =?UTF-8?q?=E6=A3=80=E6=B5=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- weibo_spider/config_util.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/weibo_spider/config_util.py b/weibo_spider/config_util.py index 311f5877..03b4ad15 100644 --- a/weibo_spider/config_util.py +++ b/weibo_spider/config_util.py @@ -50,6 +50,9 @@ def validate_config(config): max(random_wait_pages), int)): logger.warning(u'random_wait_pages列表中的值应为整数类型,请重新输入') sys.exit() + if min(random_wait_pages) < 1: + logger.warning(u'random_wait_pages列表中的值应大于0,请重新输入') + sys.exit() # 验证random_wait_seconds random_wait_seconds = config['random_wait_seconds'] @@ -60,6 +63,9 @@ def validate_config(config): max(random_wait_seconds), int)): logger.warning(u'random_wait_seconds列表中的值应为整数类型,请重新输入') sys.exit() + if min(random_wait_seconds) < 1: + logger.warning(u'random_wait_seconds列表中的值应大于0,请重新输入') + sys.exit() # 验证write_mode write_mode = ['txt', 'csv', 'json', 'mongo', 'mysql'] From 9ceb9f62232124f71b4919eb8fe576142b6687c8 Mon Sep 17 00:00:00 2001 From: dataabc Date: Tue, 28 Jul 2020 21:20:37 +0800 Subject: [PATCH 260/363] =?UTF-8?q?feat:=20=E6=B7=BB=E5=8A=A0=E5=85=A8?= =?UTF-8?q?=E5=B1=80=E7=AD=89=E5=BE=85=E5=8A=9F=E8=83=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 通过config.json的global_wait参数配置,如[ [1000, 3600], [500, 2000]]代表爬取1000页等待3600秒,然后爬取500页,再等待2000秒。可以设置多个等待配置,如果等待配置用完如前面的配置,在爬取1500页之后配置就用完了,这时会使用第一个配置即[1000, 3600],然后再使用[500, 2000],以此类推 Issue #192 --- weibo_spider/config_sample.json | 1 + weibo_spider/spider.py | 22 +++++++++++++++++++--- 2 files changed, 20 insertions(+), 3 deletions(-) 
diff --git a/weibo_spider/config_sample.json b/weibo_spider/config_sample.json index f7449dd7..35e0cb0f 100644 --- a/weibo_spider/config_sample.json +++ b/weibo_spider/config_sample.json @@ -5,6 +5,7 @@ "end_date": "now", "random_wait_pages": [1, 5], "random_wait_seconds": [6, 10], + "global_wait": [[1000, 3600], [500, 2000]], "write_mode": ["csv", "txt"], "pic_download": 1, "video_download": 1, diff --git a/weibo_spider/spider.py b/weibo_spider/spider.py index a0b196e9..a4cdc4a2 100644 --- a/weibo_spider/spider.py +++ b/weibo_spider/spider.py @@ -45,12 +45,14 @@ def __init__(self, config): self.random_wait_pages = [ min(random_wait_pages), max(random_wait_pages) - ] + ] # 随机等待频率,即每爬多少页暂停一次 random_wait_seconds = config['random_wait_seconds'] self.random_wait_seconds = [ min(random_wait_seconds), max(random_wait_seconds) - ] + ] # 随机等待时间,即每次暂停要sleep多少秒 + self.global_wait = config['global_wait'] # 配置全局等待时间,如每爬1000页等待3600秒等 + self.page_count = 0 # 统计每次全局等待后,爬取了多少页,若页数满足全局等待要求就进入下一次全局等待 self.write_mode = config[ 'write_mode'] # 结果信息保存类型,为list形式,可包含txt、csv、json、mongo和mysql五种类型 self.pic_download = config[ @@ -108,8 +110,9 @@ def write_user(self, user): writer.write_user(user) def get_user_info(self, user_uri): - # 获取用户信息、微博数、关注数、粉丝数 + """获取用户信息""" self.user = IndexParser(self.cookie, user_uri).get_user() + self.page_count += 1 def get_weibo_info(self): """获取微博信息""" @@ -122,6 +125,13 @@ def get_weibo_info(self): page_num = IndexParser( self.cookie, self.user_config['user_uri']).get_page_num() # 获取微博总页数 + self.page_count += 1 + if self.page_count > 2 and (self.page_count + + page_num) > self.global_wait[0][0]: + sleep(self.global_wait[0][1] * self.page_count / + self.global_wait[0][0]) + self.page_count = 0 + self.global_wait.append(self.global_wait.pop(0)) page1 = 0 random_pages = random.randint(*self.random_wait_pages) for page in tqdm(range(1, page_num + 1), desc='Progress'): @@ -137,6 +147,7 @@ def get_weibo_info(self): page, '-' * 30, ) + self.page_count += 1 if 
weibos: yield weibos else: @@ -149,6 +160,11 @@ def get_weibo_info(self): sleep(random.randint(*self.random_wait_seconds)) page1 = page random_pages = random.randint(*self.random_wait_pages) + + if self.page_count >= self.global_wait[0][0]: + sleep(self.global_wait[0][1]) + self.page_count = 0 + self.global_wait.append(self.global_wait.pop(0)) except Exception as e: logger.exception(e) From 500f344a979fd6b7f690cf8a923fc4b990e2f876 Mon Sep 17 00:00:00 2001 From: dataabc Date: Tue, 28 Jul 2020 22:07:45 +0800 Subject: [PATCH 261/363] =?UTF-8?q?feat:=20=E6=B7=BB=E5=8A=A0=E5=AF=B9glob?= =?UTF-8?q?al=5Fwait=E5=8F=82=E6=95=B0=E6=AD=A3=E7=A1=AE=E6=80=A7=E7=9A=84?= =?UTF-8?q?=E6=A3=80=E6=B5=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- weibo_spider/config_util.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/weibo_spider/config_util.py b/weibo_spider/config_util.py index 03b4ad15..967b8fe1 100644 --- a/weibo_spider/config_util.py +++ b/weibo_spider/config_util.py @@ -67,6 +67,23 @@ def validate_config(config): logger.warning(u'random_wait_seconds列表中的值应大于0,请重新输入') sys.exit() + # 验证global_wait + global_wait = config['global_wait'] + if not isinstance(global_wait, list): + logger.warning(u'global_wait参数值应为list类型,请重新输入') + sys.exit() + for g in global_wait: + if not isinstance(g, list): + logger.warning(u'global_wait参数内的值应为长度为2的list类型,请重新输入') + sys.exit() + if len(g) != 2: + logger.warning(u'global_wait参数内的list长度应为2,请重新输入') + sys.exit() + for i in g: + if (not isinstance(i, int)) or i < 1: + logger.warning(u'global_wait列表中的值应为大于0的整数,请重新输入') + sys.exit() + # 验证write_mode write_mode = ['txt', 'csv', 'json', 'mongo', 'mysql'] if not isinstance(config['write_mode'], list): From e2199f172e98eb770adb9196e592bd02cbab9b07 Mon Sep 17 00:00:00 2001 From: dataabc Date: Tue, 28 Jul 2020 22:34:11 +0800 Subject: [PATCH 262/363] =?UTF-8?q?perf:=20=E4=BC=98=E5=8C=96=E5=85=A8?= 
=?UTF-8?q?=E5=B1=80=E7=AD=89=E5=BE=85=E6=8F=90=E7=A4=BA?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- weibo_spider/spider.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/weibo_spider/spider.py b/weibo_spider/spider.py index a4cdc4a2..9d1e8aa7 100644 --- a/weibo_spider/spider.py +++ b/weibo_spider/spider.py @@ -128,8 +128,12 @@ def get_weibo_info(self): self.page_count += 1 if self.page_count > 2 and (self.page_count + page_num) > self.global_wait[0][0]: - sleep(self.global_wait[0][1] * self.page_count / - self.global_wait[0][0]) + wait_seconds = int( + self.global_wait[0][1] * + min(1, self.page_count / self.global_wait[0][0])) + logger.info(u'即将进入全局等待时间,%d秒后程序继续执行' % wait_seconds) + for i in tqdm(range(wait_seconds)): + sleep(1) self.page_count = 0 self.global_wait.append(self.global_wait.pop(0)) page1 = 0 From 1b14ec7e15bf963a7718edad06c86d0ea1db6d63 Mon Sep 17 00:00:00 2001 From: dataabc Date: Tue, 28 Jul 2020 22:41:59 +0800 Subject: [PATCH 263/363] =?UTF-8?q?perf:=20=E4=BC=98=E5=8C=96=E5=A4=9A?= =?UTF-8?q?=E5=BE=AE=E5=8D=9A=E7=94=A8=E6=88=B7=E5=85=A8=E5=B1=80=E7=AD=89?= =?UTF-8?q?=E5=BE=85=E6=8F=90=E7=A4=BA?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- weibo_spider/spider.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/weibo_spider/spider.py b/weibo_spider/spider.py index 9d1e8aa7..fb3f43b6 100644 --- a/weibo_spider/spider.py +++ b/weibo_spider/spider.py @@ -166,7 +166,10 @@ def get_weibo_info(self): random_pages = random.randint(*self.random_wait_pages) if self.page_count >= self.global_wait[0][0]: - sleep(self.global_wait[0][1]) + logger.info(u'即将进入全局等待时间,%d秒后程序继续执行' % + self.global_wait[0][1]) + for i in tqdm(range(self.global_wait[0][1])): + sleep(1) self.page_count = 0 self.global_wait.append(self.global_wait.pop(0)) except Exception as e: From 4723423fe7960ab1782ca3218026c789413c2382 Mon Sep 17 00:00:00 
2001 From: Chen Lei Date: Tue, 28 Jul 2020 22:57:47 +0800 Subject: [PATCH 264/363] Update settings.md --- docs/settings.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/settings.md b/docs/settings.md index 65f2f7f1..3bcc6ece 100644 --- a/docs/settings.md +++ b/docs/settings.md @@ -12,6 +12,7 @@ $ python3 -m weibo_spider "end_date": "now", "random_wait_pages": [1, 5], "random_wait_seconds": [6, 10], + "global_wait": [[1000, 3600], [500, 2000]], "write_mode": ["csv", "txt"], "pic_download": 1, "video_download": 1, @@ -66,6 +67,8 @@ end_date值可以是日期,也可以是"now"。如果是日期,代表爬取 random_wait_pages值是一个长度为2的整数列表,代表每爬取x页微博暂停一次,x为整数,值在random_wait_pages列表两个整数之间随机获取。默认值为[1, 5],代表每爬取1到5页暂停一次,如果程序被限制,可以加快暂停频率,即适当减小random_wait_pages内的值。
**设置random_wait_seconds**
random_wait_seconds值是一个长度为2的整数列表,代表每次暂停sleep x 秒,x为整数, 值在random_wait_seconds列表两个整数之间随机获取。默认值为[6, 10],代表每次暂停sleep 6到10秒,如果程序被限制,可以增加等待时间,即适当增大random_wait_seconds内的值。
+**设置global_wait**
+global_wait控制全局等待时间,默认值为[[1000, 3600], [500, 2000]],代表获取1000页微博,程序一次性暂停3600秒;之后获取500页微博,程序再一次性暂停2000秒;之后如果再获取1000页微博,程序一次性暂停3600秒,以此类推。默认的只有前面的两个全局等待时间,可以设置多个,如值可以为[[1000, 3600], [500, 3000], [700, 3600]],程序会根据配置依次等待对应时间,如果配置全部被使用,程序会从第一个配置开始,依次使用,循环往复。
**设置write_mode**
write_mode控制结果文件格式,取值范围是csv、txt、json、mongo和mysql,分别代表将结果文件写入csv、txt、json、MongoDB和MySQL数据库。write_mode可以同时包含这些取值中的一个或几个,如: ``` From f8b8c4dce0d4e7b7810f625fe26602c2e8a89a12 Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Wed, 29 Jul 2020 19:58:37 +0800 Subject: [PATCH 265/363] Update FAQ.md --- docs/FAQ.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/FAQ.md b/docs/FAQ.md index 04ecfec0..e6c6e744 100644 --- a/docs/FAQ.md +++ b/docs/FAQ.md @@ -11,8 +11,9 @@ python3 -m weibo_spider ``` "random_wait_pages": [1, 5], "random_wait_seconds": [6, 10], + "global_wait": [[1000, 3600], [500, 2000]], ``` -上面的意思是每爬取1到5页,随机等待6到10秒。可以通过加快暂停频率(减小random_wait_pages内的值)或增加等待时间(加大random_wait_seconds内的值)避免被限制。 +前两行的意思是每爬取1到5页,随机等待6到10秒。可以通过加快暂停频率(减小random_wait_pages内的值)或增加等待时间(加大random_wait_seconds内的值)避免被限制。最后一行的意思是获取1000页微博,一次性等待3600秒;之后获取500页微博一次性等待2000秒。默认只有两个global_wait配置,可以添加更多个,也可以自定义。当配置使用完,如默认配置在获取1500(1000+500)页微博后就用完了,之后程序会从第一个配置开始循环使用。 ### 3.如何获取微博评论? 因为限制,只能获取一部分评论,无法获取全部,因此暂时没有添加获取评论功能的计划。 From ef3e4e746d738bce48565199cd3e1047c497e728 Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Wed, 29 Jul 2020 20:37:07 +0800 Subject: [PATCH 266/363] Update FAQ.md --- docs/FAQ.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/FAQ.md b/docs/FAQ.md index e6c6e744..bdefb3d5 100644 --- a/docs/FAQ.md +++ b/docs/FAQ.md @@ -13,7 +13,7 @@ python3 -m weibo_spider "random_wait_seconds": [6, 10], "global_wait": [[1000, 3600], [500, 2000]], ``` -前两行的意思是每爬取1到5页,随机等待6到10秒。可以通过加快暂停频率(减小random_wait_pages内的值)或增加等待时间(加大random_wait_seconds内的值)避免被限制。最后一行的意思是获取1000页微博,一次性等待3600秒;之后获取500页微博一次性等待2000秒。默认只有两个global_wait配置,可以添加更多个,也可以自定义。当配置使用完,如默认配置在获取1500(1000+500)页微博后就用完了,之后程序会从第一个配置开始循环使用。 +前两行的意思是每爬取1到5页,随机等待6到10秒。可以通过加快暂停频率(减小random_wait_pages内的值)或增加等待时间(加大random_wait_seconds内的值)避免被限制。最后一行的意思是获取1000页微博,一次性等待3600秒;之后获取500页微博一次性等待2000秒。默认只有两个global_wait配置([1000, 3600]和[500, 
2000]),可以添加更多个,也可以自定义。当配置使用完,如默认配置在获取1500(1000+500)页微博后就用完了,之后程序会从第一个配置开始循环使用(获取第1501页到2500页等待3600秒,获取第2501页到第3000页等待2000秒,以此类推)。 ### 3.如何获取微博评论? 因为限制,只能获取一部分评论,无法获取全部,因此暂时没有添加获取评论功能的计划。 From bca8b4f30fab4ea064ef570d2daf97ca6070ff72 Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Wed, 29 Jul 2020 20:39:50 +0800 Subject: [PATCH 267/363] Update settings.md --- docs/settings.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/settings.md b/docs/settings.md index 3bcc6ece..37bf2aab 100644 --- a/docs/settings.md +++ b/docs/settings.md @@ -68,7 +68,7 @@ random_wait_pages值是一个长度为2的整数列表,代表每爬取x页微 **设置random_wait_seconds**
random_wait_seconds值是一个长度为2的整数列表,代表每次暂停sleep x 秒,x为整数, 值在random_wait_seconds列表两个整数之间随机获取。默认值为[6, 10],代表每次暂停sleep 6到10秒,如果程序被限制,可以增加等待时间,即适当增大random_wait_seconds内的值。
**设置global_wait**
-global_wait控制全局等待时间,默认值为[[1000, 3600], [500, 2000]],代表获取1000页微博,程序一次性暂停3600秒;之后获取500页微博,程序再一次性暂停2000秒;之后如果再获取1000页微博,程序一次性暂停3600秒,以此类推。默认的只有前面的两个全局等待时间,可以设置多个,如值可以为[[1000, 3600], [500, 3000], [700, 3600]],程序会根据配置依次等待对应时间,如果配置全部被使用,程序会从第一个配置开始,依次使用,循环往复。
+global_wait控制全局等待时间,默认值为[[1000, 3600], [500, 2000]],代表获取1000页微博,程序一次性暂停3600秒;之后获取500页微博,程序再一次性暂停2000秒;之后如果再获取1000页微博,程序一次性暂停3600秒,以此类推。默认的只有前面的两个全局等待时间([1000, 3600]和[500, 2000]),可以设置多个,如值可以为[[1000, 3600], [500, 3000], [700, 3600]],程序会根据配置依次等待对应时间,如果配置全部被使用,程序会从第一个配置开始,依次使用,循环往复。
**设置write_mode**
write_mode控制结果文件格式,取值范围是csv、txt、json、mongo和mysql,分别代表将结果文件写入csv、txt、json、MongoDB和MySQL数据库。write_mode可以同时包含这些取值中的一个或几个,如: ``` From bb8db00b613599c1c0b1c5ac97d9a1dc8f081538 Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Thu, 30 Jul 2020 14:38:28 +0800 Subject: [PATCH 268/363] Update config_sample.json --- weibo_spider/config_sample.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/weibo_spider/config_sample.json b/weibo_spider/config_sample.json index 35e0cb0f..39322039 100644 --- a/weibo_spider/config_sample.json +++ b/weibo_spider/config_sample.json @@ -5,7 +5,7 @@ "end_date": "now", "random_wait_pages": [1, 5], "random_wait_seconds": [6, 10], - "global_wait": [[1000, 3600], [500, 2000]], + "global_wait": [[1000, 3600], [500, 2000]], "write_mode": ["csv", "txt"], "pic_download": 1, "video_download": 1, @@ -17,4 +17,4 @@ "password": "123456", "charset": "utf8mb4" } -} \ No newline at end of file +} From 332dd2509d7ef779d535a411b98c070a357fa2b5 Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Sat, 1 Aug 2020 20:34:04 +0800 Subject: [PATCH 269/363] Update README.md --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index f2e3dbe5..3d5da8f9 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,6 @@ -![Build Status](https://github.com/dataabc/weiboSpider/workflows/Python%20application/badge.svg) +[![Build Status](https://github.com/dataabc/weiboSpider/workflows/Python%20application/badge.svg)](https://badge.fury.io/py/weibo-spider) +[![Python](https://img.shields.io/pypi/pyversions/weibo-spider)](https://badge.fury.io/py/weibo-spider) +[![PyPI](https://badge.fury.io/py/weibo-spider.svg)](https://badge.fury.io/py/weibo-spider) # Weibo Spider From d92e585b0a18ea54fcbd3ec7d5584edcffe6cf28 Mon Sep 17 00:00:00 2001 From: dataabc Date: Thu, 6 Aug 2020 21:11:35 +0800 Subject: [PATCH 270/363] =?UTF-8?q?perf:=20=E4=BC=98=E5=8C=96since=5Fdate?= =?UTF-8?q?=E5=8F=82=E6=95=B0=E5=A4=84=E7=90=86?= MIME-Version: 1.0 
Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- weibo_spider/config_util.py | 4 ++-- weibo_spider/spider.py | 12 ++++++------ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/weibo_spider/config_util.py b/weibo_spider/config_util.py index 967b8fe1..9605ce9e 100644 --- a/weibo_spider/config_util.py +++ b/weibo_spider/config_util.py @@ -30,8 +30,8 @@ def validate_config(config): sys.exit() # 验证since_date - since_date = str(config['since_date']) - if (not _is_date(since_date)) and (not since_date.isdigit()): + since_date = config['since_date'] + if (not _is_date(str(since_date))) and (not isinstance(since_date, int)): logger.warning(u'since_date值应为yyyy-mm-dd形式或整数,请重新输入') sys.exit() diff --git a/weibo_spider/spider.py b/weibo_spider/spider.py index fb3f43b6..ee9eb43a 100644 --- a/weibo_spider/spider.py +++ b/weibo_spider/spider.py @@ -35,10 +35,11 @@ def __init__(self, config): """Weibo类初始化""" self.filter = config[ 'filter'] # 取值范围为0、1,程序默认值为0,代表要爬取用户的全部微博,1代表只爬取用户的原创微博 - since_date = str(config['since_date']) - if since_date.isdigit(): - since_date = str(date.today() - timedelta(int(since_date))) - self.since_date = since_date # 起始时间,即爬取发布日期从该值到结束时间的微博,形式为yyyy-mm-dd + since_date = config['since_date'] + if isinstance(since_date, int): + since_date = date.today() - timedelta(since_date) + self.since_date = str( + since_date) # 起始时间,即爬取发布日期从该值到结束时间的微博,形式为yyyy-mm-dd self.end_date = config[ 'end_date'] # 结束时间,即爬取发布日期从起始时间到该值的微博,形式为yyyy-mm-dd,特殊值"now"代表现在 random_wait_pages = config['random_wait_pages'] @@ -119,8 +120,7 @@ def get_weibo_info(self): try: since_date = datetime_util.str_to_time( self.user_config['since_date']) - now = datetime.now().strftime('%Y-%m-%d %H:%M') - now = datetime.strptime(now, '%Y-%m-%d %H:%M') + now = datetime.now() if since_date <= now: page_num = IndexParser( self.cookie, From d16adba16a9acc0acf08495e79218d93b9d4242c Mon Sep 17 00:00:00 2001 From: Zhengyang Song Date: Thu, 13 Aug 2020 23:52:24 +0800 
Subject: [PATCH 271/363] Let git ignore log files. --- .gitignore | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 78d8e9a8..0dc0cdf5 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ .vscode + *.pyc __pycache__ @@ -6,5 +7,7 @@ build/ dist/ *.egg-info -weibo/ config.json + +weibo/ +*.log \ No newline at end of file From 3c4cf55ae31cc329a232f44d234b0ae0c9c876d5 Mon Sep 17 00:00:00 2001 From: liandy Date: Thu, 3 Sep 2020 10:11:29 +0800 Subject: [PATCH 272/363] =?UTF-8?q?=E6=B7=BB=E5=8A=A0sqlite=E6=94=AF?= =?UTF-8?q?=E6=8C=81=EF=BC=8C=E5=8F=AA=E9=9C=80=E8=A6=81=E5=9C=A8config.js?= =?UTF-8?q?on=E4=B8=AD=E7=AE=80=E5=8D=95=E9=85=8D=E7=BD=AE=E3=80=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 添加sqlite支持,只需要在config.json中简单配置。 --- weibo.db | Bin 0 -> 61440 bytes weibo_spider/config_sample.json | 3 +- weibo_spider/config_util.py | 4 +- weibo_spider/spider.py | 15 +++- weibo_spider/writer/__init__.py | 3 +- weibo_spider/writer/sqlite_writer.py | 109 +++++++++++++++++++++++++++ 6 files changed, 129 insertions(+), 5 deletions(-) create mode 100644 weibo.db create mode 100644 weibo_spider/writer/sqlite_writer.py diff --git a/weibo.db b/weibo.db new file mode 100644 index 0000000000000000000000000000000000000000..dae64a83e5166c38c38fa5ffcdd8ed4f2fd467b4 GIT binary patch literal 61440 zcmeIb32(@d`tDga@CX{msM433R;||;0HYSp=!(4U8*frm6hM0_FyDr3m(XRTKI7KLl&Pm zZ1aZGRSF!(&G48h74wu*DIpB9$!jYgYlzv`4bJamxIBK(n z=lp!?1*&yhw^o*wY)MlDEkdL#pnu&zB*hdZt!H{0F>|8ddog4DCyYH$}JHPg;C6 zH5lVL)VXa(`R0-x`&66C_NhK_SwEb{Qy*@NP{4!L)&Wx_?32p} zWS%|1%l}(^aP9mhZ@=}{{8o$1=fsZmZ^0xkr#rJVxiBk}dtDZ{*MtE`&50Qa>ad}? 
z!SpkjH`9m5w%!n6v}%0%uk0+o+-0+h9=sqkvVlG>I=Hl-I%$zY(Nzjn@%&k&*Sk$U0w(A>dQX1 z1w%7`oSWOeyI}F#KX~h9+$IY+o_;TcT3Je}g%P$c?#qykf%VN0(lDLDUhc?o&tFa z_2@@u7YLwN4SC1bNDm=lczwQ0(lDL zDUhc?o&tFaH%uQv#SuyWzM^E0K_hw5jS#iN(Ro)ZgT=-p=aPC*^xX5#6S&URCav&i0Ad zy97Rd@>$|qOZ;X-v1)DN*?@HRgm|kler{ByiXXj=u8J>O#pey;cr*S+2gU2RKOeiO z%6?LQs9)?nAMbfA9lwwMei|P=hz^S_4|L-5%Rin@fGD4X08R(zt~T5B)q>g@)d~?! zAV>uGeu{!~slXf4n5Y7WL#y*exVp&|XmYn8#M|ne7y@sdU^DRU`pFdtxhsfhg0(hO zpmFfdkR}kATtO}-01fvj|$(+Fp<;d*SL2$bQvbXY)A||mU`R7!9 zlTJPrdq+NNxPh5EF?ugCewL>2kv23D%@v> zn;1DXiJOft;pQA8>EZ=#K&?|pss%2>Asj90woD$$9Lz`+_ygg(aH;~XZVEHbp@M)p zW~xn9pwY+zB{QGq(gkwCl{(sTP=QQ$}Gf71Y?PO{o@mHT>ijo6(h8xnbl18)73hA-^oWn16R+PC`=2JR^ocDk z&?^%oJy5gBtH%<}N5t-hSfZ0i({j!q1nrYSoydRpwcD7FqJo32q6qgN7RV{~Wy5Y^jzN$ee*7;2B-KPFzd zkT`Kr8hoshE)GcdN5$*sQ5jX=d^qzoA5OI16AvAS=tAAn>A^(%m`ZGUD)wI# z`<~J@{8V~)1rjfQV-%WF10$ACSA29#I{88+tWI=viN}wy z&XDwE9q73{F*E{NNcXNxjC4q+T2zUvccs=l)U%O8;ESu)`DK-Ky;Hh;EZJ~bx_Cw<_P0vy*OQ~i#7jq2>eUlty?77a znHWR8BWT^k=(WU^TdH{1WvQc0ZrB5SkQncdpS%^nc}%>21MQa{ox(>_?}&7DP&{&4 zYU{w*s4$F)n+K&^SK>E2>7`fi<5DHhT}O)(&12&AOUbhj(I^%Bgh}8;M`Pkd8;!*P zY>H78`kZ+3jMacIC5}E5yH1N;1BshWcql&FB|SMRoobEuw9}&#LoX(Vu8S|usEV~x zZ!20JKQ|`czgDb*K7pPQyN^M9$6uhwXK{&rZSf=h)Jt^f@MG!JJ(b8PJ$f2CLGJ!p zjIIi*DA9i#b>Obl221B4wXk=7NP# zhD|$Zwl(SBT=fx6H~S_}*k8J9OJ=$y8V`xx_oRnyAV+$Beb&$`oxSu~!v(SZ7UK{Y zvKZmwjV@o%;qs|Ukon8&DrSQtwRX$Q2~wPjx5|36_e^47G||xV^MW&IAQq`(Vw49wmee^BkO+pP6+0)$?sZS=Zm-_s}XMh zk(_TJ^byhs)Y*a`*Ek_?GLP0*7TIU5-@Vo3-k{mFd&mAQW!iPycJYSI zpG0H!(z590(yFMvv}7xX|K2Z)`CZ7O@G-A78hMV_8Z_^RLN3S0d&>gq@;bM9ujBnq z<)t4-ysl8#7qop}esg*(46ArpBZf6i8UR2ux!(3s}~SiqvDM)N2d{a@w_MVdo1%UA2eC#6VET4#Pg25<(~RI z!oCcV(`&Qj;ZBm;{nrx%@(dp80H2fPd3InbtBWnSFxg2O-hx6JAtiAC$V}|5`upMK z{Z_SgtIbnYw~O0RvDy79ysT!ttgfuxZP;%%MJi|IWvru@mg3j4HTboh$FCLN#jk?z z;8)St@oUL4+SIRH@u%|&I||Pg-dypgEBw^#mnrF&)F%EK$)Td7`IvU2OnYbzhDJhk$VR{r~y<13RzuNA$y^8YLP zsOa6IkBc@H?JIH=MT(k>jurj8BC+T%iY^u1NAvPOc?#qykf%VN0(lDLDUhc?o&tFa z$zN;S@4rN;9^ zRZ8deauRp%AJDGv-P*-Qmt+a=h-0+rHYSzf5sHu$8eB!VhS*gA{ 
zV$$nv2DudD&Dv_sZnd73%xa5u)6Oa#E7|I;(T!z6tx{UAcI_368dloBJ62s;woWaV z>Kv6j4T7~sU?rDV?b2;1=UFMZIb35BDml4SUsJu`SXr{^Bc)Vct<`wySIec6+C63U zTHoGPa;em*SKIYfTfQ%s%GO6MA)T$}dsM36T|%ikY7YFCToSm=5nniL{E(H_Z7Yjy z^+!KorS<-r4gLz@yK+e#v)AbOlKtVij{)pL67>APu^jrvZ|foicOL4u+p}T zZcW|h=(kyEuhCMwyF~Nrth6s&6|1P;`YpMnt#kXGF^l2XSZU+7YM**n-Pc%Y?MRv}$WO-RrDWWwL5)>vctP$*4ByIrsk1N>{@JfR!R!DmQG4cvrAeEbJ|-4|tZ#rPP-6GFDQya+k`b)SlcD zR#G;>7E>v`J@A_RS;_^!NG_#3gbP_oaZoKNSoqGeij*g%5OQSMy#F|F<+(!5ioT`# zC7p}TF8tg1PjTxX#7}`z{QmzW-+O-lKfnK<-~Z3=|L6Dr^ZWlm z0_6Aq^ZWn#{r}k@xsw3|`Tc)7J7X3s%U8Dlze?a`EDJXsX0c@1f?r&kU$}|Z|I6p~ z&0G1M0_XAv%g!!&viQHh=37)g|F3cLf0m!7ZKc41G{2hyMiPj>JfQd*GhpCEV|y|1 z?$WvQ#VUY5iW5&R0zMYM(Nzp^2Z6?(T%=olClgN|i@mo1CXA1a6Zq@MsMvl*eAWlZ z9Dq&)^k+EY7d?rN8$g78{=5;`5<)Wp84uuK-vtJ1Og@(}eSkQ2x6z%}+k~y^8v~>+ zdEp`|NgTZk{1qMp{)*rz16>nCcK|s7#F0>Q!;i%NHnFi!9C?OU%!K940W~%2qk0?8 z#0daw9YU&YX`TiCwR@0R0P30GS*aa*k(vt zEVZ8z509ev;>C_c(@4DOC_~PHeu*nbfGI0h0n2nt8axKXcYN%2>Lw!)(X0PFf%VW; z*4usp1X6DSs*@f+oV4(ISs1~rpkuYeY&Hh{AueDg$dQk$F@#;}$(W^C$;c`Qxb)Qk z@hcU$!;>qR6}7BXVAk8ssR|si>d6(%fSSsLg0KoAdVioURe|0$B?@2;Er?iLgj!}@ z2-v5fmgiD|HE8stD&RFT@Ola>=_wdRt7U^%vTfVq^Iro)tr##g6|W*4DY z!x!Q`LlYwpL7rv^kxu*uY8c`}I(nRaLgH!cvZ~Z(3a$cBbz*2N-h5wtMlfgb1;y=5 z1z`e%Xy1w8H&kqyg6!JU*(B$`jRcRk^QmCtsKv3c$|0>s9zu9%1~Ne z**m2^GE%-676L`DD zso5!62y< zBX^TmhT}c`6dje8VABG>a><)WwO6OL$+O)Z;at9e-bLbKGXD?|l|aGixwd5c4;3!@ z-5=3hi^^Q|p29_`U9(KXi`lp%s14CE#l?(`CWd;&&Z83}$EAknV5Xr9Vs{UlQ&Pjx ziQ!g2gT*fJ+YP|aF*VU`!0Q3q2hU;R0Ol?pyNc&2%BOFPKD~QZJj8BZ%8H^JY6CzX zpnYKVrHiLzDEp1#FM&B@k$Qet^)9o_oV)8d%Q{DSiaD$K8r!dcIdzqr0#TRAv0>Y6 zU`&C4#T4kB z#>dh`#@J9a)`bT4TO53l>>Gy-Ljg~HV~EufyPM;UcatYu#8d4Mf5l>d%fxUCp1>ER zq1)1ba3R%!#X|kFE*>bg9NSB*h3+o2P&J_MY%v7@0J7irp*8f zS`%=0@n=UB-q(Cggn4H&?<>2)8N%dh^W?dqN?tfVF?1HOC~T5|tk6HwsdE$)akeXd z;Ywnl;j@O5xG3BM;UitA6V1I}0zdQXt9I}<&3>7m_v(Y&eVaF@`Po&rCl@~()nA^l z<(aV6xpDm6R*SVfho}XH8=_XLn~k3rUwNSnoBFqD*uFVQ@kNyF`>bJ8=-=I(Jk$Y^ zB`YpPfT=J%(1X(8NeVrbe}lWluCsVlywD@w?**}n#>o>_`wy)go 
zH##?^#?0>JV!6ib%T$ocm^t4QHt4+;wK}-TnnMa{IOqy&d zm_^p8Wy}@ChKMbLi4IsPu2T0=+ue^YqwI zXJY&?<|ghZ2A(0_I@y4@ppLI#jQ47FF`btCgvNNCZbQhsUmZw|@kW#LE7dc~7;mcB zq}BL8sdbglKEenl!n{<^Ixm-beQbmavBqCE@BMj&zhCe_mjA)BpDw=ontNgU{C|a8 zSwGG1tsy>`Kba2{fK3)3J7`6}LMjXxR{`yPp19qFY8W^KwYq_H?R2FnlYc#7a2 z7^a9DBn#*pW4|$d;ayRTOR@hlvoCYo48Qz3g*Vs4_HPXYl9^Bt1a83`)hT)us6pP)zhHzS{OD3 zcSL3EeqzAb!O5j84cXW!K()-+VW=qdwCimA#ZlSTyMz(IP(W73`y0g0Q@BYnPw|rv z#6vBKh9lzj(<(&4#}7W0dLH9T#dsc4HGZpuykdxoAG|7^9*B=VP7IvEXm5|7c)E6H z)D;TZ(zjJ4qgj-t9Ak=CPza^$&0_75|85W;HnMl1X4RBB z=tKewSSAQyK98vb8W1gq5dPc)vLUpb1Yhf6^3r1M1lRKU)1V^4zlow}o{9}19!S_H zwVjaK`r&Mro;Tr|80p7_VB_SOTdLyhBr2FKsWdFv`BU^SiT2DzR99`awl+|k&i4>? zIwrdv=TL!8XN{^cDlBD%OE_)LhPh3gp^?%joD`G!IBd1KNL&s5_bP7Ika@>6thEh8+hxNlw9j5<(p051uimgd z_{BBvD?5M%#Q|)9%2ltk={DMe^*J=J%pSZj+xoWX&uSRoG_F&_0sN0s^G|jF#}rh5 zhHWrBQf~*;q>9D|tCa2=lnx0-@b&Y_GyTYO^x1_Ivv`AN6@hAp#jnwN0;Yhh8aV{q zJG`<0f~3I3Qbed%@oZIROnZah`1-$9n177`=5KVTw(?}dF%^xF=|wS{?aauJx88wW zac_uSpFbzgqkI_51x&&L;q$rux!F4GWt(@oCVf!n($*SXqyS6-&SchWUM+!^1r_Lo zI-?I2AX$Me%GFNU{hO8k&|%a9w@r_n3B1D-bv(YZCOJcWM`MX%K3G z!6)wrU^59xk8D#~Foy~>VXt5T4V)pm>gD~%DJ+Cca$4l{Ar;iHnlVoyR{6{4zdf(; zS>ehR&gBiux|aUYk`0SCE>z*>|Gqy57xSc<*QXZJvO=j##q?y-2BzKa;_xZTXVf-4 z@d8_z>D>|J-h)p=y#7oY8k!jH7JKfC*L$UNwA><-v-S4G$a!gKKObb`CwxF@9}?3+rAU!{&v{9}7I2e!nv7!01Rvug+*)4)jnv01hVU;o43VDqEt zH#smuQ+Y~7FyX3%b7h1p^6p^FrL#woIFU5O;Va2AwDSsSn`nC|HDiBeB+=dkjh`61 zhctMw0mQC5$gqQtk|%G#-Gz@7-*?LU1(*rxo8}g2;AHYJl31Om=Lb$oFu|TZ$nEI< z0*kazwZ`Bu=_mnRFn}v;b56HpXH5l3ac)^;>N*xb zI!KIt{XBXS@4fW1XIQbq{3MJ>*8od7l=ef-Z7G^bL-_@P`<8k~KyR#dAmr}?lkfo3ayaHvK~zCRV*`3ah*$(&(Ce6j-pI9xK?O#a-iU=9A2#q| zW7s}r88wFrbYWMR7BO`$-XKK%uigR^j1^KrRDsJ;`)Zm0h=?YW$AfIBT)@Zk!RpAX zB?FZ!@Ng#N^28938fJ;@GT9I3Km)T{qrp;A@H_Zu)N7xK{f#pFlaJzSiGRk}zj@JW zk`iuG7fs)LQ_eCA_Gf{e8TL;$G$JJ)))eFnlR+Jr^%9cXVVVyPCB}QP?P`P*Ep2dO z0vHTBRqna2`gucpT21`Cp(7grvTG}Q7o57}*ucvOdxnkUt{dbAuN0BJ6orXkCsrdQxwl8MOJL+IM8Vs>;t>0V& z9t_uUA&1u}Pb0JvSIU+b^eWD{M$4sb`M2IRC_V6gA3fM!*i=|K-Lr8(C;DMIJ;nOv^kDL2H+-Op 
z{wv~8t9a}d(k>=%oCZHL3eoHW!0cHc`vx2uiJRA8K4A-yhJOYHnL|$7)Q*rEk?UBj z2m!k~9CF&^wZ`P$BA+b{uMT&=Hu~#|ged+kk`QvP-DW^`sGsQ&?yP=F7aNo3x>;U0 z^zaNaPoEaA+)WI0#_wK?-#>{iF?SGrF~7KUtPLrg#Ycn40QvcIa_y2|qH6$uCl7a_ z7(e$SdH4v5=roFA=Na)uBP>mN7zFs`&A{pnW!8vJh>@$H)Wn2!EqNN2jufrq+jaU5kpu&yse~9U~HJceLQjztUK@v~2 zPPs4=Ei%&S#lrPoX9Vn&N&s$}YHy*T@El?XknF8Dj@Gk=9^CK&*y&&-wLX#A zNl>Exvs(J1mGD_Dk6B(vz$?XTcj3{S7`hK%5}!*1!xW7+|1J(Es|2Tsun9Bs|Q zC~F-QPst9%lphf8KOi+Jqg!h$tM;-^suainnWr$slP4P?0SLs5O%A!G>E}r7zalzP z?W_sgcha_0sn@Yjw{@2_weYdTHd%fJ9A8sjQc_wK)LNrCINnHdiBl8Dn-$BY5NrHp z^Zwnu!qFA~Y1v;b{hKAdiwo2+rXAnK{J^GBcG`bI$%B!ZAI0ME5EwXd z^9JdOF08}bvAKin_JAHu`=nSUj@*D;#bzpCpx{Mh%Y`L1G2S0*tUafMpR9FrON%=St#W8zwm&jBOmFq|Bay2-~3D>goGm>v_KJ!FV6*d_6i8+41PEW6#|m}Y)1R0GqEQJIFN>skDA zBeuI?*dYpKh6(%28#r#W&0JGn@0JamI;*|L&#zC3`-&Zg4PRW`zw%xPZ(s&ag~Ow_ zlm&K%X2pTYBT4v7^=y8w#h?EN4DrIPf6@HLZ<4@#D>b)ix zC%cp;x}3dO>g~kNcJ>GGP)e;`(*0{liB4yWbi*!f#2O2854)5kMaSs^W6+Gj{SkP; z=zf}EWI?JtgYiyYxQd4f(R|@D+m<9I=srp~Lb)l#2We1##x3-+wb3ks(u1{}UC)_o zv53P-0=&-4yQ8n>)ln)4+H`(YpoyRYy?xro4u%OKH5vfC(u=_ro?UF(HT_O8#-vf3 z$}=s?+f-7f%$Q7uJ*c*A-Lgw%M4I!C#MKU@XupH`A{~PTg}nI?!(GLdyXg_*-11H5 z*m$rTb=9$2M*to}_|1^`Kc;)N-5XTkHrEPTa$Gsoe9Ymlp5iu|b(DBy^wL!ag`f{o zjaA_F+os#l0@NA5MoTvIihB|xZ)5B?h`fPGHiAaUa+M+PbkHI)q-#pxi--NpY-aE; zlAI^69cN+>T*=To41$7BX$WZ7ob-XJIQ8Xhvx7!sS{O+#fo`;DJTZe>=RnMt3zygL zSnY7x(Men>O-^0cvXfsH|5i1+zWC6+=D@q?`ax4_wlmIBQhH^&zLpLH=ou4l9Zp_) z0(cli`BPdD^pk^#-~nS7VawxV^jOCmit!gA0MS_-33}( z@zdn(Z^r$^(-vCH%bV|{%16go#gG!@BnAW@i-eY20PfJ+h5$kY2I`)4dJL_?*VEoC z+@y#F2E>t<4XidSInWe_lp{eQH!gntdAv^gTXJ|H?5MyP3JMgehzgLp-!g5dPrC}Ib>KuD zjv_vntM@W?Z{DK9n6pC!$OA!JS``{4aa(50U!ufGagQ8rwk8z4PHwtf7w-` zxDrxM1o{Yz1-XcMjd#j0o;xrw0A!URH;sySeY`o@K%hOE?DBazXab}GWCgfarTS&o z|9oV-!&EM>|2Ni{f+gE_rVK*YKF<8bx$BiZUYwp8gpL|tbXP=M;o)ar|EmE?g+VCC zfq%B0F;O8_`OD{@omcq(7L=~oviQx{ezM54@acm8w%{yo=6`-=6lnbEcgQ&LrHm8V z9@i+TG&7TXigBW>yV#0WdUho_N{8DJ(Dm9mpb~J}+1UXSzr!sw#jXt|FFm5kEnD+5 zO5?OO{c5%eY+tM3)N2Iz83dlE)8yRtX=a&@VA2$-k`+$=$jSKfz_*LudG~uH+Hh#& 
zd$OuhGKXe5Pnrha!S-8H11V=__M|Nz)DAy90%*DH=iWZ_ckBg?YknQB-bR!1iemW{ zN;=ZaD-`;AT(vg&>_owU?KzN)oq-4)wszVQlfd%KrKmjg9N7F098Fc?_pkb4wwzv1Zl%*=GM~@DICU&RkY8w_^LpI2Tz09#Iiy0jvd+cncxpdClRhd{E3EeAzF&s5AwMqv^h$$fZ;;3v-ZmEkTdadr#ar{c4V zM0t3OZ=Yk8kUVtzVy$!7Qs~X(oIWKzptnCTSJS^JAo@oEG0 z+8Um6XUJQSINf!!O`U5ANflIETZ);$X^MzbTnTgN0&nqG6OL9M1#8U|2k9KTAo!hXoIS}Za0_ybN`{EJ z87U|jczD>A3N%JpN@gm^&B{Tw&Znotv*a#>T~m(lnnMK|m)@P;&)1uxlSg3=G|&Xy z_B0=uoKtW>bErV8F;*K=U9i?oS>ev50++XrR=6?^oI2y=F65>`pw3&Pcc&@{c&5mM zIkZ4s>ocZD!EKgXzzhuLSJBD)$C@>A9Kr{V4-{n=dRvzFS@4aDRCe-=*nu% zgH$|=4px%dROpm6?3r%A#IrCl+JJBfIL6XykWP;z2hw%1Z9w+V(90p|?^}h08oguZPtJ_((^NTO(Ue(91HR{-s4&e?P_ifm_*UQ%kvn}ethH@4L z&N!Qw|Fy?&Vx*QX`_93lw=hx%Xa1unUNA(&ofUSSzR*NhP3v`2|8%C z_j&SEFVuScNG~q-DHJ~&MHxeOn8p%OEG>_KDyblbYC*)q9v~!5=T*RE z;9mCoLbha!eA?@^mQN>M9Qv^;ew*a|T9egfHHEx3A6#8`21Uk z#LY%xm{Zr*X1;m77tV%hu9O@LI+KqxA(+FC&1K53$_sAew9szQK;FS;e)Sdq7&KTE zUET721u_2mNkN%D<;blJ4JI?biegG-j~&}wBb@&6BUx>%Bt=*U<6GPb7q3~>} zs~g#DlV@mpAWZ=rU-%7I7uXpyiRGV#ZA6k9Hwm^d48~?=kYLoB+m>u>d3wJv&)&~%gX{lbF8*Z!C(!q_c@WofFJ~B zi_=>B3dSLa z-ZN#0NdeH^3{-N7#o!?yx3T_3kXdB2f)U3rvZDJ zjY>Y6qqoR8%s5A@TJi!fFk3W zf0!bE3^A_0ygq^ zd=v?3&&1nWCq^kFwAgqasSk>iW8K(p5IegVg_368o`LUX)*AU<}7b}OD;BEiD|uLJ_F{RwZlTib@8HGR-vWEAB}{zJ7<-S@-iP<7)@Hvoaj{dpa~w=X z4~{o~5p7a$H`jz}^pJFv-$Jdi8Dz)Ql!1XZ!DOOIy6~TuVqgmXyVSgT18I{V(PmXT zzASYRW7fdToFe48jA*9h1vGn7;iXEnsjO@j=p~pKIVcbdC1+nljsWR?AJfAO&m`Tt zg01$P(Zz^Go3SB?ON)=TQiRLZg9siVBMnoBAqo+uF!B5nK@kpjLItt!$D78a#tu*# zwD@u((VP^CrZ@09Q_ZwUG_{)fU^Kyf8cihX)>&pZfdrU9*k+{tp2dIrPuWare7}r# z7iGH_86AVc##A-#f41@0S0IWoMRFy!OWne;dX8&;JAzX#V!^V%9f)Sa^_a zgUiu?X0A5uwQSpz;UXxrHYdJJ1>0GA*VQM2LZh+Dvp_qiC#wBZzKI!~z0tPqAQUM({j9A#jUAEka|m)ds2Y-dl=?a(KimEEp|Obnt33uk~e_E#w(6?A#XvV^HxWlRcmLw|0M)4a-7D- zRYbSa;ybowe;}l zPF;n?1de85OkldTQ(ix(h2*UAfOqKYv2CLb(i%wZaA%5SOtOf4sDZ8(8=t8a5ABIfS^3`f9&%5iX3n@!NVLYA8c=F=GKF*|JQ^k*A@~{}8 zRc|tr$Gk)t;|% z4l96POJx=2FFa_1dG{ybm)VaQVj`c49EiN07R|VtzV|z%`m3@5`sG&pCmVb_w)ryD 
z6i+{cxlZA4Fdn6ckJ#Q=ads>m<|O7aY=7w(yr>=0smJlQ!z>_=ediaKjvSdZ{azL> zlY@5+W^2$Li3O4RfRBcS>Oi1&3Irtw>rn+(UtKL=IIM!0PDYSSW<3F{2TYiJCct$i z@54d;r=ZrsJ+dNJg6qqz{;lD^*$LPwHJp;3Aj_02 zfZ;<$1St!j#;HtLQJlSx3*pnh_od7=p5I$)_tRXf(`!Q2yQ?>*^rc2yR+Ve6AuV$Z z`R`W?nyH7w=q+lE-lntJc%6|q8`V~e-lo+HI=jYZS8L4bN$WIOUn=nD4Lh*^xsKoD z-!l6=qd+Uf0O@J@^KV{f^Q`6f_R~D8nUuH4rma|S-eoMwa87m#S0tDa2T!IGxvD0I zU%?RqCl5@fc+Y84hgtp~07zN(Ozdn-#lF2PbI~kwjsS@UXV7kSyNm$asC6by(Eci( zOQiy*R_)*s49xU}%`kWDNU8;%`nm{m=BV`+O<1rQrWoO~C|#ujhso*=0XrvXoN6Hy zsGXAgZWghTsz7ZqT97wL2t_n{)2sN8Q(dU_8+BEQ@O^&cvdOJj2<4SX}{DL&p)q>wIz)kf#hfY}$_1 zESw+*t=2jW5p$rL5)jGgIem1}npWUeaR3cL6dIsJ(^#&ezQy2&mG)~*-+hxhR+2J2 zSjQrDW~0tv&2)^xgVsvtP9%HosKlET+YK)Rj*Y?*YrSppo1F@P2$}DOZ_tNGG=DQI zu~NqW@G^5_mT50{w9Zq@#r$Em`Uhl|H%|d;<V06$G5sly6L?RKPiHZ%?-hhiO1* zXw+#+En7pG0T+x`Hf{bIDAoL}?0t<2eVO3*J1rFOkTN=^$E9Q0tK@8^Kt*f`$ms>3 zEs~9gv31FUNU=!5Dk|P_FMhWJhRDSD6>LVcwN$+Okko+fw28}qNV=PQTPy( z4c+NAmXg6Z6@8DjJyi(HMPBg1gTQw6Se;ax96(Rw)_<0T_`xiEHU0w_yFM5&*4QJ! zk0O~WMEaEV1LHqBmlUd)jJPGk6`9F7E#phGqVTKWY5%bu8=B`7dR8U~p7g z(c3I8C+|(~Jy^fU#xslX$}y)M8kb+IiZpAJ?)SHOi6e>G;4%R4@^c$VsUSPFwm2SBi@= zRGcP+XR#+qwK*JSETp`4*&vMq)2)wA3li5;B7Fpy2^q)s>wkO~eSYnmzi8fHK=Y_L zd*h+PqIJ~NM0cRNlxkbM+vkYbHrrHdO*r1u%tb7gmj{w^dTieCKM%X-vPkb^6$<20s;UQc1aNGgT5(^)y#KFM0E!(2t=I=ys!6wOZDI3)Jc0pKjO9k`7z!GfsT zq29iu*1MiuDI3)FLOoX!O}SF+6;{_5w;)~>_op=~Ti)Cz-;Phz5zWq7ldpoi;3t#qx6y4RB#%z$nvK`>o2|1dqw6n3wHj&N?G`hm)(f=xg*EI8iiei%t_& z6l7*f&pIPn1)5+O&M>6{voIxoa1Iri>`p5dZL9^;5Nxw*N!9|Zoi}1Hl~phU{7W7+ SWV{pzSyI$O!Jdnr`u=|c=)m{@ literal 0 HcmV?d00001 diff --git a/weibo_spider/config_sample.json b/weibo_spider/config_sample.json index 39322039..185f0d1c 100644 --- a/weibo_spider/config_sample.json +++ b/weibo_spider/config_sample.json @@ -16,5 +16,6 @@ "user": "root", "password": "123456", "charset": "utf8mb4" - } + }, + "sqlite_config": "../weibo.db" } diff --git a/weibo_spider/config_util.py b/weibo_spider/config_util.py index 9605ce9e..48b00ebb 100644 --- a/weibo_spider/config_util.py +++ 
b/weibo_spider/config_util.py @@ -85,14 +85,14 @@ def validate_config(config): sys.exit() # 验证write_mode - write_mode = ['txt', 'csv', 'json', 'mongo', 'mysql'] + write_mode = ['txt', 'csv', 'json', 'mongo', 'mysql', 'sqlite'] if not isinstance(config['write_mode'], list): logger.warning(u'write_mode值应为list类型') sys.exit() for mode in config['write_mode']: if mode not in write_mode: logger.warning( - u'%s为无效模式,请从txt、csv、json、mongo和mysql中挑选一个或多个作为write_mode', + u'%s为无效模式,请从txt、csv、json、mongo和mysql、sqlite中挑选一个或多个作为write_mode', mode) sys.exit() diff --git a/weibo_spider/spider.py b/weibo_spider/spider.py index ee9eb43a..48493233 100644 --- a/weibo_spider/spider.py +++ b/weibo_spider/spider.py @@ -24,6 +24,13 @@ flags.DEFINE_string('u', None, 'The user_id we want to input.') flags.DEFINE_string('user_id_list', None, 'The path to user_id_list.txt.') flags.DEFINE_string('output_dir', None, 'The dir path to store results.') + +# flags.DEFINE_string('multiprocess', None, 'multiprocessing.....................') +# flags.DEFINE_string('print-in-debugger-startup', None, 'print-in-debugger-startup.....................') +# flags.DEFINE_string('client', None, 'client.....................') +# flags.DEFINE_string('port', None, 'port.....................') +# flags.DEFINE_string('file', None, 'file.....................') + logging_path = os.path.split( os.path.realpath(__file__))[0] + os.sep + 'logging.conf' logging.config.fileConfig(logging_path) @@ -62,6 +69,8 @@ def __init__(self, config): 'video_download'] # 取值范围为0、1,程序默认为0,代表不下载微博视频,1代表下载 self.cookie = {'Cookie': config['cookie']} self.mysql_config = config.get('mysql_config') # MySQL数据库连接配置,可以不填 + + self.sqlite_config = config.get('sqlite_config') self.user_config_file_path = '' user_id_list = config['user_id_list'] @@ -226,7 +235,11 @@ def initialize_info(self, user_config): from .writer import MongoWriter self.writers.append(MongoWriter()) - + if 'sqlite' in self.write_mode: + from .writer import SqliteWriter + + 
self.writers.append(SqliteWriter(self.sqlite_config)) + self.downloaders = [] if self.pic_download == 1: from .downloader import ImgDownloader diff --git a/weibo_spider/writer/__init__.py b/weibo_spider/writer/__init__.py index 95fc931b..024f9937 100644 --- a/weibo_spider/writer/__init__.py +++ b/weibo_spider/writer/__init__.py @@ -3,5 +3,6 @@ from .mongo_writer import MongoWriter from .mysql_writer import MySqlWriter from .txt_writer import TxtWriter +from .sqlite_writer import SqliteWriter -__all__ = [CsvWriter, TxtWriter, JsonWriter, MongoWriter, MySqlWriter] +__all__ = [CsvWriter, TxtWriter, JsonWriter, MongoWriter, MySqlWriter, SqliteWriter] diff --git a/weibo_spider/writer/sqlite_writer.py b/weibo_spider/writer/sqlite_writer.py new file mode 100644 index 00000000..3eec1f0c --- /dev/null +++ b/weibo_spider/writer/sqlite_writer.py @@ -0,0 +1,109 @@ +import copy +import logging +import sys + +from .writer import Writer + +logger = logging.getLogger('spider.sqlite_writer') + + +class SqliteWriter(Writer): + def __init__(self, sqlite_config): + self.sqlite_config = sqlite_config + + + def _sqlite_create(self, connection, sql): + """创建sqlite数据库或表""" + try: + cursor = connection.cursor() + cursor.execute(sql) + finally: + connection.close() + + def _sqlite_create_table(self, sql): + """创建sqlite表""" + import sqlite3 + connection = sqlite3.connect(self.sqlite_config) + self._sqlite_create(connection, sql) + + def _sqlite_insert(self, table, data_list): + """向sqlite表插入或更新数据""" + import sqlite3 + if len(data_list) > 0: + # We use this to filter out unset values. 
+ data_list = [{k: v + for k, v in data.items() if v is not None} + for data in data_list] + + keys = ', '.join(data_list[0].keys()) + values = ', '.join(['?'] * len(data_list[0])) + connection = sqlite3.connect(self.sqlite_config) + cursor = connection.cursor() + sql = """INSERT OR REPLACE INTO {table}({keys}) VALUES ({values})""".format( + table=table, keys=keys, values=values) + try: + cursor.executemany( + sql, [tuple(data.values()) for data in data_list]) + connection.commit() + except Exception as e: + connection.rollback() + logger.exception(e) + finally: + connection.close() + + def write_weibo(self, weibos): + """将爬取的微博信息写入sqlite数据库""" + # 创建'weibo'表 + create_table = """ + CREATE TABLE IF NOT EXISTS weibo ( + id varchar(10) NOT NULL, + user_id varchar(12), + content varchar(2000), + article_url varchar(200), + original_pictures varchar(3000), + retweet_pictures varchar(3000), + original BOOLEAN NOT NULL DEFAULT 1, + video_url varchar(300), + publish_place varchar(100), + publish_time DATETIME NOT NULL, + publish_tool varchar(30), + up_num INT NOT NULL, + retweet_num INT NOT NULL, + comment_num INT NOT NULL, + PRIMARY KEY (id) + )""" + self._sqlite_create_table(create_table) + # 在'weibo'表中插入或更新微博数据 + weibo_list = [] + info_list = copy.deepcopy(weibos) + for weibo in info_list: + weibo.user_id = self.user.id + weibo_list.append(weibo.__dict__) + self._sqlite_insert('weibo', weibo_list) + logger.info(u'%d条微博写入sqlite数据库完毕', len(weibos)) + + def write_user(self, user): + """将爬取的用户信息写入sqlite数据库""" + self.user = user + + # 创建'user'表 + create_table = """ + CREATE TABLE IF NOT EXISTS user ( + id varchar(20) NOT NULL, + nickname varchar(30), + gender varchar(10), + location varchar(200), + birthday varchar(40), + description varchar(140), + verified_reason varchar(140), + talent varchar(200), + education varchar(200), + work varchar(200), + weibo_num INT, + following INT, + followers INT, + PRIMARY KEY (id) + )""" + self._sqlite_create_table(create_table) + 
self._sqlite_insert('user', [user.__dict__]) + logger.info(u'%s信息写入sqlite数据库完毕', user.nickname) From 747980fb2cbe9c688307df4f1e33d0458f978d1d Mon Sep 17 00:00:00 2001 From: liandy Date: Fri, 4 Sep 2020 20:47:30 +0800 Subject: [PATCH 273/363] Update spider.py --- weibo_spider/spider.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/weibo_spider/spider.py b/weibo_spider/spider.py index 48493233..2408c5d4 100644 --- a/weibo_spider/spider.py +++ b/weibo_spider/spider.py @@ -25,12 +25,6 @@ flags.DEFINE_string('user_id_list', None, 'The path to user_id_list.txt.') flags.DEFINE_string('output_dir', None, 'The dir path to store results.') -# flags.DEFINE_string('multiprocess', None, 'multiprocessing.....................') -# flags.DEFINE_string('print-in-debugger-startup', None, 'print-in-debugger-startup.....................') -# flags.DEFINE_string('client', None, 'client.....................') -# flags.DEFINE_string('port', None, 'port.....................') -# flags.DEFINE_string('file', None, 'file.....................') - logging_path = os.path.split( os.path.realpath(__file__))[0] + os.sep + 'logging.conf' logging.config.fileConfig(logging_path) From e372eca1dc1768d1b9e5a6a03458cfb861fc32f6 Mon Sep 17 00:00:00 2001 From: mingkai Date: Fri, 4 Sep 2020 20:48:40 +0800 Subject: [PATCH 274/363] Delete weibo.db --- weibo.db | Bin 61440 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 weibo.db diff --git a/weibo.db b/weibo.db deleted file mode 100644 index dae64a83e5166c38c38fa5ffcdd8ed4f2fd467b4..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 61440 zcmeIb32(@d`tDga@CX{msM433R;||;0HYSp=!(4U8*frm6hM0_FyDr3m(XRTKI7KLl&Pm zZ1aZGRSF!(&G48h74wu*DIpB9$!jYgYlzv`4bJamxIBK(n z=lp!?1*&yhw^o*wY)MlDEkdL#pnu&zB*hdZt!H{0F>|8ddog4DCyYH$}JHPg;C6 zH5lVL)VXa(`R0-x`&66C_NhK_SwEb{Qy*@NP{4!L)&Wx_?32p} zWS%|1%l}(^aP9mhZ@=}{{8o$1=fsZmZ^0xkr#rJVxiBk}dtDZ{*MtE`&50Qa>ad}? 
z!SpkjH`9m5w%!n6v}%0%uk0+o+-0+h9=sqkvVlG>I=Hl-I%$zY(Nzjn@%&k&*Sk$U0w(A>dQX1 z1w%7`oSWOeyI}F#KX~h9+$IY+o_;TcT3Je}g%P$c?#qykf%VN0(lDLDUhc?o&tFa z_2@@u7YLwN4SC1bNDm=lczwQ0(lDL zDUhc?o&tFaH%uQv#SuyWzM^E0K_hw5jS#iN(Ro)ZgT=-p=aPC*^xX5#6S&URCav&i0Ad zy97Rd@>$|qOZ;X-v1)DN*?@HRgm|kler{ByiXXj=u8J>O#pey;cr*S+2gU2RKOeiO z%6?LQs9)?nAMbfA9lwwMei|P=hz^S_4|L-5%Rin@fGD4X08R(zt~T5B)q>g@)d~?! zAV>uGeu{!~slXf4n5Y7WL#y*exVp&|XmYn8#M|ne7y@sdU^DRU`pFdtxhsfhg0(hO zpmFfdkR}kATtO}-01fvj|$(+Fp<;d*SL2$bQvbXY)A||mU`R7!9 zlTJPrdq+NNxPh5EF?ugCewL>2kv23D%@v> zn;1DXiJOft;pQA8>EZ=#K&?|pss%2>Asj90woD$$9Lz`+_ygg(aH;~XZVEHbp@M)p zW~xn9pwY+zB{QGq(gkwCl{(sTP=QQ$}Gf71Y?PO{o@mHT>ijo6(h8xnbl18)73hA-^oWn16R+PC`=2JR^ocDk z&?^%oJy5gBtH%<}N5t-hSfZ0i({j!q1nrYSoydRpwcD7FqJo32q6qgN7RV{~Wy5Y^jzN$ee*7;2B-KPFzd zkT`Kr8hoshE)GcdN5$*sQ5jX=d^qzoA5OI16AvAS=tAAn>A^(%m`ZGUD)wI# z`<~J@{8V~)1rjfQV-%WF10$ACSA29#I{88+tWI=viN}wy z&XDwE9q73{F*E{NNcXNxjC4q+T2zUvccs=l)U%O8;ESu)`DK-Ky;Hh;EZJ~bx_Cw<_P0vy*OQ~i#7jq2>eUlty?77a znHWR8BWT^k=(WU^TdH{1WvQc0ZrB5SkQncdpS%^nc}%>21MQa{ox(>_?}&7DP&{&4 zYU{w*s4$F)n+K&^SK>E2>7`fi<5DHhT}O)(&12&AOUbhj(I^%Bgh}8;M`Pkd8;!*P zY>H78`kZ+3jMacIC5}E5yH1N;1BshWcql&FB|SMRoobEuw9}&#LoX(Vu8S|usEV~x zZ!20JKQ|`czgDb*K7pPQyN^M9$6uhwXK{&rZSf=h)Jt^f@MG!JJ(b8PJ$f2CLGJ!p zjIIi*DA9i#b>Obl221B4wXk=7NP# zhD|$Zwl(SBT=fx6H~S_}*k8J9OJ=$y8V`xx_oRnyAV+$Beb&$`oxSu~!v(SZ7UK{Y zvKZmwjV@o%;qs|Ukon8&DrSQtwRX$Q2~wPjx5|36_e^47G||xV^MW&IAQq`(Vw49wmee^BkO+pP6+0)$?sZS=Zm-_s}XMh zk(_TJ^byhs)Y*a`*Ek_?GLP0*7TIU5-@Vo3-k{mFd&mAQW!iPycJYSI zpG0H!(z590(yFMvv}7xX|K2Z)`CZ7O@G-A78hMV_8Z_^RLN3S0d&>gq@;bM9ujBnq z<)t4-ysl8#7qop}esg*(46ArpBZf6i8UR2ux!(3s}~SiqvDM)N2d{a@w_MVdo1%UA2eC#6VET4#Pg25<(~RI z!oCcV(`&Qj;ZBm;{nrx%@(dp80H2fPd3InbtBWnSFxg2O-hx6JAtiAC$V}|5`upMK z{Z_SgtIbnYw~O0RvDy79ysT!ttgfuxZP;%%MJi|IWvru@mg3j4HTboh$FCLN#jk?z z;8)St@oUL4+SIRH@u%|&I||Pg-dypgEBw^#mnrF&)F%EK$)Td7`IvU2OnYbzhDJhk$VR{r~y<13RzuNA$y^8YLP zsOa6IkBc@H?JIH=MT(k>jurj8BC+T%iY^u1NAvPOc?#qykf%VN0(lDLDUhc?o&tFa z$zN;S@4rN;9^ zRZ8deauRp%AJDGv-P*-Qmt+a=h-0+rHYSzf5sHu$8eB!VhS*gA{ 
zV$$nv2DudD&Dv_sZnd73%xa5u)6Oa#E7|I;(T!z6tx{UAcI_368dloBJ62s;woWaV z>Kv6j4T7~sU?rDV?b2;1=UFMZIb35BDml4SUsJu`SXr{^Bc)Vct<`wySIec6+C63U zTHoGPa;em*SKIYfTfQ%s%GO6MA)T$}dsM36T|%ikY7YFCToSm=5nniL{E(H_Z7Yjy z^+!KorS<-r4gLz@yK+e#v)AbOlKtVij{)pL67>APu^jrvZ|foicOL4u+p}T zZcW|h=(kyEuhCMwyF~Nrth6s&6|1P;`YpMnt#kXGF^l2XSZU+7YM**n-Pc%Y?MRv}$WO-RrDWWwL5)>vctP$*4ByIrsk1N>{@JfR!R!DmQG4cvrAeEbJ|-4|tZ#rPP-6GFDQya+k`b)SlcD zR#G;>7E>v`J@A_RS;_^!NG_#3gbP_oaZoKNSoqGeij*g%5OQSMy#F|F<+(!5ioT`# zC7p}TF8tg1PjTxX#7}`z{QmzW-+O-lKfnK<-~Z3=|L6Dr^ZWlm z0_6Aq^ZWn#{r}k@xsw3|`Tc)7J7X3s%U8Dlze?a`EDJXsX0c@1f?r&kU$}|Z|I6p~ z&0G1M0_XAv%g!!&viQHh=37)g|F3cLf0m!7ZKc41G{2hyMiPj>JfQd*GhpCEV|y|1 z?$WvQ#VUY5iW5&R0zMYM(Nzp^2Z6?(T%=olClgN|i@mo1CXA1a6Zq@MsMvl*eAWlZ z9Dq&)^k+EY7d?rN8$g78{=5;`5<)Wp84uuK-vtJ1Og@(}eSkQ2x6z%}+k~y^8v~>+ zdEp`|NgTZk{1qMp{)*rz16>nCcK|s7#F0>Q!;i%NHnFi!9C?OU%!K940W~%2qk0?8 z#0daw9YU&YX`TiCwR@0R0P30GS*aa*k(vt zEVZ8z509ev;>C_c(@4DOC_~PHeu*nbfGI0h0n2nt8axKXcYN%2>Lw!)(X0PFf%VW; z*4usp1X6DSs*@f+oV4(ISs1~rpkuYeY&Hh{AueDg$dQk$F@#;}$(W^C$;c`Qxb)Qk z@hcU$!;>qR6}7BXVAk8ssR|si>d6(%fSSsLg0KoAdVioURe|0$B?@2;Er?iLgj!}@ z2-v5fmgiD|HE8stD&RFT@Ola>=_wdRt7U^%vTfVq^Iro)tr##g6|W*4DY z!x!Q`LlYwpL7rv^kxu*uY8c`}I(nRaLgH!cvZ~Z(3a$cBbz*2N-h5wtMlfgb1;y=5 z1z`e%Xy1w8H&kqyg6!JU*(B$`jRcRk^QmCtsKv3c$|0>s9zu9%1~Ne z**m2^GE%-676L`DD zso5!62y< zBX^TmhT}c`6dje8VABG>a><)WwO6OL$+O)Z;at9e-bLbKGXD?|l|aGixwd5c4;3!@ z-5=3hi^^Q|p29_`U9(KXi`lp%s14CE#l?(`CWd;&&Z83}$EAknV5Xr9Vs{UlQ&Pjx ziQ!g2gT*fJ+YP|aF*VU`!0Q3q2hU;R0Ol?pyNc&2%BOFPKD~QZJj8BZ%8H^JY6CzX zpnYKVrHiLzDEp1#FM&B@k$Qet^)9o_oV)8d%Q{DSiaD$K8r!dcIdzqr0#TRAv0>Y6 zU`&C4#T4kB z#>dh`#@J9a)`bT4TO53l>>Gy-Ljg~HV~EufyPM;UcatYu#8d4Mf5l>d%fxUCp1>ER zq1)1ba3R%!#X|kFE*>bg9NSB*h3+o2P&J_MY%v7@0J7irp*8f zS`%=0@n=UB-q(Cggn4H&?<>2)8N%dh^W?dqN?tfVF?1HOC~T5|tk6HwsdE$)akeXd z;Ywnl;j@O5xG3BM;UitA6V1I}0zdQXt9I}<&3>7m_v(Y&eVaF@`Po&rCl@~()nA^l z<(aV6xpDm6R*SVfho}XH8=_XLn~k3rUwNSnoBFqD*uFVQ@kNyF`>bJ8=-=I(Jk$Y^ zB`YpPfT=J%(1X(8NeVrbe}lWluCsVlywD@w?**}n#>o>_`wy)go 
zH##?^#?0>JV!6ib%T$ocm^t4QHt4+;wK}-TnnMa{IOqy&d zm_^p8Wy}@ChKMbLi4IsPu2T0=+ue^YqwI zXJY&?<|ghZ2A(0_I@y4@ppLI#jQ47FF`btCgvNNCZbQhsUmZw|@kW#LE7dc~7;mcB zq}BL8sdbglKEenl!n{<^Ixm-beQbmavBqCE@BMj&zhCe_mjA)BpDw=ontNgU{C|a8 zSwGG1tsy>`Kba2{fK3)3J7`6}LMjXxR{`yPp19qFY8W^KwYq_H?R2FnlYc#7a2 z7^a9DBn#*pW4|$d;ayRTOR@hlvoCYo48Qz3g*Vs4_HPXYl9^Bt1a83`)hT)us6pP)zhHzS{OD3 zcSL3EeqzAb!O5j84cXW!K()-+VW=qdwCimA#ZlSTyMz(IP(W73`y0g0Q@BYnPw|rv z#6vBKh9lzj(<(&4#}7W0dLH9T#dsc4HGZpuykdxoAG|7^9*B=VP7IvEXm5|7c)E6H z)D;TZ(zjJ4qgj-t9Ak=CPza^$&0_75|85W;HnMl1X4RBB z=tKewSSAQyK98vb8W1gq5dPc)vLUpb1Yhf6^3r1M1lRKU)1V^4zlow}o{9}19!S_H zwVjaK`r&Mro;Tr|80p7_VB_SOTdLyhBr2FKsWdFv`BU^SiT2DzR99`awl+|k&i4>? zIwrdv=TL!8XN{^cDlBD%OE_)LhPh3gp^?%joD`G!IBd1KNL&s5_bP7Ika@>6thEh8+hxNlw9j5<(p051uimgd z_{BBvD?5M%#Q|)9%2ltk={DMe^*J=J%pSZj+xoWX&uSRoG_F&_0sN0s^G|jF#}rh5 zhHWrBQf~*;q>9D|tCa2=lnx0-@b&Y_GyTYO^x1_Ivv`AN6@hAp#jnwN0;Yhh8aV{q zJG`<0f~3I3Qbed%@oZIROnZah`1-$9n177`=5KVTw(?}dF%^xF=|wS{?aauJx88wW zac_uSpFbzgqkI_51x&&L;q$rux!F4GWt(@oCVf!n($*SXqyS6-&SchWUM+!^1r_Lo zI-?I2AX$Me%GFNU{hO8k&|%a9w@r_n3B1D-bv(YZCOJcWM`MX%K3G z!6)wrU^59xk8D#~Foy~>VXt5T4V)pm>gD~%DJ+Cca$4l{Ar;iHnlVoyR{6{4zdf(; zS>ehR&gBiux|aUYk`0SCE>z*>|Gqy57xSc<*QXZJvO=j##q?y-2BzKa;_xZTXVf-4 z@d8_z>D>|J-h)p=y#7oY8k!jH7JKfC*L$UNwA><-v-S4G$a!gKKObb`CwxF@9}?3+rAU!{&v{9}7I2e!nv7!01Rvug+*)4)jnv01hVU;o43VDqEt zH#smuQ+Y~7FyX3%b7h1p^6p^FrL#woIFU5O;Va2AwDSsSn`nC|HDiBeB+=dkjh`61 zhctMw0mQC5$gqQtk|%G#-Gz@7-*?LU1(*rxo8}g2;AHYJl31Om=Lb$oFu|TZ$nEI< z0*kazwZ`Bu=_mnRFn}v;b56HpXH5l3ac)^;>N*xb zI!KIt{XBXS@4fW1XIQbq{3MJ>*8od7l=ef-Z7G^bL-_@P`<8k~KyR#dAmr}?lkfo3ayaHvK~zCRV*`3ah*$(&(Ce6j-pI9xK?O#a-iU=9A2#q| zW7s}r88wFrbYWMR7BO`$-XKK%uigR^j1^KrRDsJ;`)Zm0h=?YW$AfIBT)@Zk!RpAX zB?FZ!@Ng#N^28938fJ;@GT9I3Km)T{qrp;A@H_Zu)N7xK{f#pFlaJzSiGRk}zj@JW zk`iuG7fs)LQ_eCA_Gf{e8TL;$G$JJ)))eFnlR+Jr^%9cXVVVyPCB}QP?P`P*Ep2dO z0vHTBRqna2`gucpT21`Cp(7grvTG}Q7o57}*ucvOdxnkUt{dbAuN0BJ6orXkCsrdQxwl8MOJL+IM8Vs>;t>0V& z9t_uUA&1u}Pb0JvSIU+b^eWD{M$4sb`M2IRC_V6gA3fM!*i=|K-Lr8(C;DMIJ;nOv^kDL2H+-Op 
z{wv~8t9a}d(k>=%oCZHL3eoHW!0cHc`vx2uiJRA8K4A-yhJOYHnL|$7)Q*rEk?UBj z2m!k~9CF&^wZ`P$BA+b{uMT&=Hu~#|ged+kk`QvP-DW^`sGsQ&?yP=F7aNo3x>;U0 z^zaNaPoEaA+)WI0#_wK?-#>{iF?SGrF~7KUtPLrg#Ycn40QvcIa_y2|qH6$uCl7a_ z7(e$SdH4v5=roFA=Na)uBP>mN7zFs`&A{pnW!8vJh>@$H)Wn2!EqNN2jufrq+jaU5kpu&yse~9U~HJceLQjztUK@v~2 zPPs4=Ei%&S#lrPoX9Vn&N&s$}YHy*T@El?XknF8Dj@Gk=9^CK&*y&&-wLX#A zNl>Exvs(J1mGD_Dk6B(vz$?XTcj3{S7`hK%5}!*1!xW7+|1J(Es|2Tsun9Bs|Q zC~F-QPst9%lphf8KOi+Jqg!h$tM;-^suainnWr$slP4P?0SLs5O%A!G>E}r7zalzP z?W_sgcha_0sn@Yjw{@2_weYdTHd%fJ9A8sjQc_wK)LNrCINnHdiBl8Dn-$BY5NrHp z^Zwnu!qFA~Y1v;b{hKAdiwo2+rXAnK{J^GBcG`bI$%B!ZAI0ME5EwXd z^9JdOF08}bvAKin_JAHu`=nSUj@*D;#bzpCpx{Mh%Y`L1G2S0*tUafMpR9FrON%=St#W8zwm&jBOmFq|Bay2-~3D>goGm>v_KJ!FV6*d_6i8+41PEW6#|m}Y)1R0GqEQJIFN>skDA zBeuI?*dYpKh6(%28#r#W&0JGn@0JamI;*|L&#zC3`-&Zg4PRW`zw%xPZ(s&ag~Ow_ zlm&K%X2pTYBT4v7^=y8w#h?EN4DrIPf6@HLZ<4@#D>b)ix zC%cp;x}3dO>g~kNcJ>GGP)e;`(*0{liB4yWbi*!f#2O2854)5kMaSs^W6+Gj{SkP; z=zf}EWI?JtgYiyYxQd4f(R|@D+m<9I=srp~Lb)l#2We1##x3-+wb3ks(u1{}UC)_o zv53P-0=&-4yQ8n>)ln)4+H`(YpoyRYy?xro4u%OKH5vfC(u=_ro?UF(HT_O8#-vf3 z$}=s?+f-7f%$Q7uJ*c*A-Lgw%M4I!C#MKU@XupH`A{~PTg}nI?!(GLdyXg_*-11H5 z*m$rTb=9$2M*to}_|1^`Kc;)N-5XTkHrEPTa$Gsoe9Ymlp5iu|b(DBy^wL!ag`f{o zjaA_F+os#l0@NA5MoTvIihB|xZ)5B?h`fPGHiAaUa+M+PbkHI)q-#pxi--NpY-aE; zlAI^69cN+>T*=To41$7BX$WZ7ob-XJIQ8Xhvx7!sS{O+#fo`;DJTZe>=RnMt3zygL zSnY7x(Men>O-^0cvXfsH|5i1+zWC6+=D@q?`ax4_wlmIBQhH^&zLpLH=ou4l9Zp_) z0(cli`BPdD^pk^#-~nS7VawxV^jOCmit!gA0MS_-33}( z@zdn(Z^r$^(-vCH%bV|{%16go#gG!@BnAW@i-eY20PfJ+h5$kY2I`)4dJL_?*VEoC z+@y#F2E>t<4XidSInWe_lp{eQH!gntdAv^gTXJ|H?5MyP3JMgehzgLp-!g5dPrC}Ib>KuD zjv_vntM@W?Z{DK9n6pC!$OA!JS``{4aa(50U!ufGagQ8rwk8z4PHwtf7w-` zxDrxM1o{Yz1-XcMjd#j0o;xrw0A!URH;sySeY`o@K%hOE?DBazXab}GWCgfarTS&o z|9oV-!&EM>|2Ni{f+gE_rVK*YKF<8bx$BiZUYwp8gpL|tbXP=M;o)ar|EmE?g+VCC zfq%B0F;O8_`OD{@omcq(7L=~oviQx{ezM54@acm8w%{yo=6`-=6lnbEcgQ&LrHm8V z9@i+TG&7TXigBW>yV#0WdUho_N{8DJ(Dm9mpb~J}+1UXSzr!sw#jXt|FFm5kEnD+5 zO5?OO{c5%eY+tM3)N2Iz83dlE)8yRtX=a&@VA2$-k`+$=$jSKfz_*LudG~uH+Hh#& 
zd$OuhGKXe5Pnrha!S-8H11V=__M|Nz)DAy90%*DH=iWZ_ckBg?YknQB-bR!1iemW{ zN;=ZaD-`;AT(vg&>_owU?KzN)oq-4)wszVQlfd%KrKmjg9N7F098Fc?_pkb4wwzv1Zl%*=GM~@DICU&RkY8w_^LpI2Tz09#Iiy0jvd+cncxpdClRhd{E3EeAzF&s5AwMqv^h$$fZ;;3v-ZmEkTdadr#ar{c4V zM0t3OZ=Yk8kUVtzVy$!7Qs~X(oIWKzptnCTSJS^JAo@oEG0 z+8Um6XUJQSINf!!O`U5ANflIETZ);$X^MzbTnTgN0&nqG6OL9M1#8U|2k9KTAo!hXoIS}Za0_ybN`{EJ z87U|jczD>A3N%JpN@gm^&B{Tw&Znotv*a#>T~m(lnnMK|m)@P;&)1uxlSg3=G|&Xy z_B0=uoKtW>bErV8F;*K=U9i?oS>ev50++XrR=6?^oI2y=F65>`pw3&Pcc&@{c&5mM zIkZ4s>ocZD!EKgXzzhuLSJBD)$C@>A9Kr{V4-{n=dRvzFS@4aDRCe-=*nu% zgH$|=4px%dROpm6?3r%A#IrCl+JJBfIL6XykWP;z2hw%1Z9w+V(90p|?^}h08oguZPtJ_((^NTO(Ue(91HR{-s4&e?P_ifm_*UQ%kvn}ethH@4L z&N!Qw|Fy?&Vx*QX`_93lw=hx%Xa1unUNA(&ofUSSzR*NhP3v`2|8%C z_j&SEFVuScNG~q-DHJ~&MHxeOn8p%OEG>_KDyblbYC*)q9v~!5=T*RE z;9mCoLbha!eA?@^mQN>M9Qv^;ew*a|T9egfHHEx3A6#8`21Uk z#LY%xm{Zr*X1;m77tV%hu9O@LI+KqxA(+FC&1K53$_sAew9szQK;FS;e)Sdq7&KTE zUET721u_2mNkN%D<;blJ4JI?biegG-j~&}wBb@&6BUx>%Bt=*U<6GPb7q3~>} zs~g#DlV@mpAWZ=rU-%7I7uXpyiRGV#ZA6k9Hwm^d48~?=kYLoB+m>u>d3wJv&)&~%gX{lbF8*Z!C(!q_c@WofFJ~B zi_=>B3dSLa z-ZN#0NdeH^3{-N7#o!?yx3T_3kXdB2f)U3rvZDJ zjY>Y6qqoR8%s5A@TJi!fFk3W zf0!bE3^A_0ygq^ zd=v?3&&1nWCq^kFwAgqasSk>iW8K(p5IegVg_368o`LUX)*AU<}7b}OD;BEiD|uLJ_F{RwZlTib@8HGR-vWEAB}{zJ7<-S@-iP<7)@Hvoaj{dpa~w=X z4~{o~5p7a$H`jz}^pJFv-$Jdi8Dz)Ql!1XZ!DOOIy6~TuVqgmXyVSgT18I{V(PmXT zzASYRW7fdToFe48jA*9h1vGn7;iXEnsjO@j=p~pKIVcbdC1+nljsWR?AJfAO&m`Tt zg01$P(Zz^Go3SB?ON)=TQiRLZg9siVBMnoBAqo+uF!B5nK@kpjLItt!$D78a#tu*# zwD@u((VP^CrZ@09Q_ZwUG_{)fU^Kyf8cihX)>&pZfdrU9*k+{tp2dIrPuWare7}r# z7iGH_86AVc##A-#f41@0S0IWoMRFy!OWne;dX8&;JAzX#V!^V%9f)Sa^_a zgUiu?X0A5uwQSpz;UXxrHYdJJ1>0GA*VQM2LZh+Dvp_qiC#wBZzKI!~z0tPqAQUM({j9A#jUAEka|m)ds2Y-dl=?a(KimEEp|Obnt33uk~e_E#w(6?A#XvV^HxWlRcmLw|0M)4a-7D- zRYbSa;ybowe;}l zPF;n?1de85OkldTQ(ix(h2*UAfOqKYv2CLb(i%wZaA%5SOtOf4sDZ8(8=t8a5ABIfS^3`f9&%5iX3n@!NVLYA8c=F=GKF*|JQ^k*A@~{}8 zRc|tr$Gk)t;|% z4l96POJx=2FFa_1dG{ybm)VaQVj`c49EiN07R|VtzV|z%`m3@5`sG&pCmVb_w)ryD 
z6i+{cxlZA4Fdn6ckJ#Q=ads>m<|O7aY=7w(yr>=0smJlQ!z>_=ediaKjvSdZ{azL> zlY@5+W^2$Li3O4RfRBcS>Oi1&3Irtw>rn+(UtKL=IIM!0PDYSSW<3F{2TYiJCct$i z@54d;r=ZrsJ+dNJg6qqz{;lD^*$LPwHJp;3Aj_02 zfZ;<$1St!j#;HtLQJlSx3*pnh_od7=p5I$)_tRXf(`!Q2yQ?>*^rc2yR+Ve6AuV$Z z`R`W?nyH7w=q+lE-lntJc%6|q8`V~e-lo+HI=jYZS8L4bN$WIOUn=nD4Lh*^xsKoD z-!l6=qd+Uf0O@J@^KV{f^Q`6f_R~D8nUuH4rma|S-eoMwa87m#S0tDa2T!IGxvD0I zU%?RqCl5@fc+Y84hgtp~07zN(Ozdn-#lF2PbI~kwjsS@UXV7kSyNm$asC6by(Eci( zOQiy*R_)*s49xU}%`kWDNU8;%`nm{m=BV`+O<1rQrWoO~C|#ujhso*=0XrvXoN6Hy zsGXAgZWghTsz7ZqT97wL2t_n{)2sN8Q(dU_8+BEQ@O^&cvdOJj2<4SX}{DL&p)q>wIz)kf#hfY}$_1 zESw+*t=2jW5p$rL5)jGgIem1}npWUeaR3cL6dIsJ(^#&ezQy2&mG)~*-+hxhR+2J2 zSjQrDW~0tv&2)^xgVsvtP9%HosKlET+YK)Rj*Y?*YrSppo1F@P2$}DOZ_tNGG=DQI zu~NqW@G^5_mT50{w9Zq@#r$Em`Uhl|H%|d;<V06$G5sly6L?RKPiHZ%?-hhiO1* zXw+#+En7pG0T+x`Hf{bIDAoL}?0t<2eVO3*J1rFOkTN=^$E9Q0tK@8^Kt*f`$ms>3 zEs~9gv31FUNU=!5Dk|P_FMhWJhRDSD6>LVcwN$+Okko+fw28}qNV=PQTPy( z4c+NAmXg6Z6@8DjJyi(HMPBg1gTQw6Se;ax96(Rw)_<0T_`xiEHU0w_yFM5&*4QJ! zk0O~WMEaEV1LHqBmlUd)jJPGk6`9F7E#phGqVTKWY5%bu8=B`7dR8U~p7g z(c3I8C+|(~Jy^fU#xslX$}y)M8kb+IiZpAJ?)SHOi6e>G;4%R4@^c$VsUSPFwm2SBi@= zRGcP+XR#+qwK*JSETp`4*&vMq)2)wA3li5;B7Fpy2^q)s>wkO~eSYnmzi8fHK=Y_L zd*h+PqIJ~NM0cRNlxkbM+vkYbHrrHdO*r1u%tb7gmj{w^dTieCKM%X-vPkb^6$<20s;UQc1aNGgT5(^)y#KFM0E!(2t=I=ys!6wOZDI3)Jc0pKjO9k`7z!GfsT zq29iu*1MiuDI3)FLOoX!O}SF+6;{_5w;)~>_op=~Ti)Cz-;Phz5zWq7ldpoi;3t#qx6y4RB#%z$nvK`>o2|1dqw6n3wHj&N?G`hm)(f=xg*EI8iiei%t_& z6l7*f&pIPn1)5+O&M>6{voIxoa1Iri>`p5dZL9^;5Nxw*N!9|Zoi}1Hl~phU{7W7+ SWV{pzSyI$O!Jdnr`u=|c=)m{@ From 9ed19efbdc4d595d80c863738afd6aa0d13d5805 Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Fri, 4 Sep 2020 22:10:30 +0800 Subject: [PATCH 275/363] Update contributors.md --- docs/contributors.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/contributors.md b/docs/contributors.md index 40082f1e..12651fd6 100644 --- a/docs/contributors.md +++ b/docs/contributors.md @@ -11,8 +11,8 @@ ## 代码贡献者 -|[codermino](https://github.com/codermino) 
|[duangan1](https://github.com/duangan1) | -| - | - | +|[codermino](https://github.com/codermino) |[duangan1](https://github.com/duangan1) | [MKSP2015](https://github.com/MKSP2015) | +| - | - | - | ## 优质issue提出者 From 65380cc890fa088c04abfc6a73436e30a7e86a15 Mon Sep 17 00:00:00 2001 From: dataabc Date: Sat, 12 Sep 2020 12:31:11 +0800 Subject: [PATCH 276/363] =?UTF-8?q?fix:=20=E4=BF=AE=E5=A4=8D=E8=8E=B7?= =?UTF-8?q?=E5=8F=96=E9=95=BF=E5=BE=AE=E5=8D=9A=E5=A4=B1=E8=B4=A5=E6=97=B6?= =?UTF-8?q?=E6=AD=A3=E6=96=87=E5=8F=98=E6=88=90=E7=BD=91=E7=BB=9C=E5=87=BA?= =?UTF-8?q?=E9=94=99=E7=9A=84bug?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Issue #218 --- weibo_spider/parser/comment_parser.py | 1 - 1 file changed, 1 deletion(-) diff --git a/weibo_spider/parser/comment_parser.py b/weibo_spider/parser/comment_parser.py index af6aa1e8..8e10fa3c 100644 --- a/weibo_spider/parser/comment_parser.py +++ b/weibo_spider/parser/comment_parser.py @@ -30,7 +30,6 @@ def get_long_weibo(self): sleep(random.randint(6, 10)) except Exception: logger.exception(u'网络出错') - return u'网络出错' def get_long_retweet(self): """获取长转发微博""" From 828f5078b01ddb918fde11eea6e01c04dfc9f2a2 Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Tue, 15 Sep 2020 18:24:33 +0800 Subject: [PATCH 277/363] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 3d5da8f9..530884d5 100644 --- a/README.md +++ b/README.md @@ -13,6 +13,7 @@ - 写入**json文件**(可选) - 写入**MySQL数据库**(可选) - 写入**MongoDB数据库**(可选) +- 写入**SQLite数据库**(可选) - 下载用户**原创**微博中的原始**图片**(可选) - 下载用户**转发**微博中的原始**图片**([免cookie版](https://github.com/dataabc/weibo-crawler)特有) - 下载用户**原创**微博中的**视频**(可选) From 026662a58ae06c7cab75008a3742caf14117b975 Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Tue, 15 Sep 2020 18:34:52 +0800 Subject: [PATCH 278/363] Update settings.md --- docs/settings.md | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/docs/settings.md 
b/docs/settings.md index 37bf2aab..4e772315 100644 --- a/docs/settings.md +++ b/docs/settings.md @@ -23,7 +23,8 @@ $ python3 -m weibo_spider "user": "root", "password": "123456", "charset": "utf8mb4" - } + }, + "sqlite_config": "../weibo.db" } ``` 下面讲解每个参数的含义与设置方法。
@@ -90,7 +91,9 @@ video_download控制是否下载微博中的视频,值为1代表下载,值 **设置cookie**
请按照[如何获取cookie](https://github.com/dataabc/weiboSpider/blob/master/docs/cookie.md),获取cookie,然后将“your cookie”替换成真实的cookie值。
**设置mysql_config(可选)**
-mysql_config控制mysql参数配置。如果你不需要将结果信息写入mysql,这个参数可以忽略,即删除或保留都无所谓;如果你需要写入mysql且config.json文件中mysql_config的配置与你的mysql配置不一样,请将该值改成你自己mysql中的参数配置。 +mysql_config控制mysql参数配置。如果你不需要将结果信息写入mysql,这个参数可以忽略,即删除或保留都无所谓;如果你需要写入mysql且config.json文件中mysql_config的配置与你的mysql配置不一样,请将该值改成你自己mysql中的参数配置。
+**设置sqlite_config(可选)**
+sqlite_config控制SQLite参数配置,代表SQLite数据库的保存路径,可根据自己需求修改。 ## 设置数据库(可选) 本部分是可选部分,如果不需要将爬取信息写入数据库,可跳过这一步。本程序目前支持MySQL数据库和MongoDB数据库,如果你需要写入其它数据库,可以参考这两个数据库的写法自己编写。
From ec4e7a1383c00b613d5af8141866d5748ea0800e Mon Sep 17 00:00:00 2001 From: dataabc Date: Tue, 15 Sep 2020 22:04:22 +0800 Subject: [PATCH 279/363] =?UTF-8?q?fix:=20=E4=BF=AE=E5=A4=8Dsqlite?= =?UTF-8?q?=E9=BB=98=E8=AE=A4=E8=B7=AF=E5=BE=84=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- setup.py | 4 ++-- weibo_spider/config_sample.json | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index 456b9b27..7cdc4b9f 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setuptools.setup( name='weibo-spider', - version='0.0.8', + version='0.1.1', author='Chen Lei', author_email='chillychen1991@gmail.com', description='新浪微博爬虫,用python爬取新浪微博数据。', @@ -13,7 +13,7 @@ long_description_content_type='text/markdown', url='https://github.com/dataabc/weiboSpider', packages=setuptools.find_packages(), - package_data={'weibo_spider': ['config_sample.json']}, + package_data={'weibo_spider': ['config_sample.json', 'logging.conf']}, classifiers=[ 'Programming Language :: Python :: 3', 'Operating System :: OS Independent', diff --git a/weibo_spider/config_sample.json b/weibo_spider/config_sample.json index 185f0d1c..f5e9f764 100644 --- a/weibo_spider/config_sample.json +++ b/weibo_spider/config_sample.json @@ -17,5 +17,5 @@ "password": "123456", "charset": "utf8mb4" }, - "sqlite_config": "../weibo.db" + "sqlite_config": "weibo.db" } From b8399df4971efe5567a13f519a310aba6e13617b Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Tue, 15 Sep 2020 22:08:04 +0800 Subject: [PATCH 280/363] Update settings.md --- docs/settings.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/settings.md b/docs/settings.md index 4e772315..3ae83be9 100644 --- a/docs/settings.md +++ b/docs/settings.md @@ -24,7 +24,7 @@ $ python3 -m weibo_spider "password": "123456", "charset": "utf8mb4" }, - "sqlite_config": "../weibo.db" + "sqlite_config": "weibo.db" } ``` 下面讲解每个参数的含义与设置方法。
@@ -71,7 +71,7 @@ random_wait_seconds值是一个长度为2的整数列表,代表每次暂停sle **设置global_wait**
global_wait控制全局等待时间,默认值为[[1000, 3600], [500, 2000]],代表获取1000页微博,程序一次性暂停3600秒;之后获取500页微博,程序再一次性暂停2000秒;之后如果再获取1000页微博,程序一次性暂停3600秒,以此类推。默认的只有前面的两个全局等待时间([1000, 3600]和[500, 2000]),可以设置多个,如值可以为[[1000, 3600], [500, 3000], [700, 3600]],程序会根据配置依次等待对应时间,如果配置全部被使用,程序会从第一个配置开始,依次使用,循环往复。
**设置write_mode**
-write_mode控制结果文件格式,取值范围是csv、txt、json、mongo和mysql,分别代表将结果文件写入csv、txt、json、MongoDB和MySQL数据库。write_mode可以同时包含这些取值中的一个或几个,如: +write_mode控制结果文件格式,取值范围是csv、txt、json、mongo、mysql和sqlite,分别代表将结果文件写入csv、txt、json、MongoDB、MySQL和SQLite数据库。write_mode可以同时包含这些取值中的一个或几个,如: ``` "write_mode": ["csv", "txt"], ``` From 5714fe74dd888440f7703443ac187a6f67b65d54 Mon Sep 17 00:00:00 2001 From: dataabc Date: Sun, 27 Sep 2020 21:06:45 +0800 Subject: [PATCH 281/363] =?UTF-8?q?fix:=20=E4=BF=AE=E5=A4=8D=E4=BF=A1?= =?UTF-8?q?=E6=81=AF=E8=BF=87=E9=95=BF=E6=97=B6=E6=97=A0=E6=B3=95=E5=86=99?= =?UTF-8?q?=E5=85=A5=E6=95=B0=E6=8D=AE=E5=BA=93=E7=9A=84=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- weibo_spider/spider.py | 6 +++--- weibo_spider/writer/mysql_writer.py | 2 +- weibo_spider/writer/sqlite_writer.py | 3 +-- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/weibo_spider/spider.py b/weibo_spider/spider.py index 2408c5d4..dde659c6 100644 --- a/weibo_spider/spider.py +++ b/weibo_spider/spider.py @@ -63,7 +63,7 @@ def __init__(self, config): 'video_download'] # 取值范围为0、1,程序默认为0,代表不下载微博视频,1代表下载 self.cookie = {'Cookie': config['cookie']} self.mysql_config = config.get('mysql_config') # MySQL数据库连接配置,可以不填 - + self.sqlite_config = config.get('sqlite_config') self.user_config_file_path = '' @@ -231,9 +231,9 @@ def initialize_info(self, user_config): self.writers.append(MongoWriter()) if 'sqlite' in self.write_mode: from .writer import SqliteWriter - + self.writers.append(SqliteWriter(self.sqlite_config)) - + self.downloaders = [] if self.pic_download == 1: from .downloader import ImgDownloader diff --git a/weibo_spider/writer/mysql_writer.py b/weibo_spider/writer/mysql_writer.py index d395b123..f731c464 100644 --- a/weibo_spider/writer/mysql_writer.py +++ b/weibo_spider/writer/mysql_writer.py @@ -121,7 +121,7 @@ def write_user(self, user): gender varchar(10), location varchar(200), birthday varchar(40), - description 
varchar(140), + description varchar(400), verified_reason varchar(140), talent varchar(200), education varchar(200), diff --git a/weibo_spider/writer/sqlite_writer.py b/weibo_spider/writer/sqlite_writer.py index 3eec1f0c..cea0ccd9 100644 --- a/weibo_spider/writer/sqlite_writer.py +++ b/weibo_spider/writer/sqlite_writer.py @@ -11,7 +11,6 @@ class SqliteWriter(Writer): def __init__(self, sqlite_config): self.sqlite_config = sqlite_config - def _sqlite_create(self, connection, sql): """创建sqlite数据库或表""" try: @@ -94,7 +93,7 @@ def write_user(self, user): gender varchar(10), location varchar(200), birthday varchar(40), - description varchar(140), + description varchar(400), verified_reason varchar(140), talent varchar(200), education varchar(200), From d1973b0df5629db9c28b200dcd3b14ea66fb483b Mon Sep 17 00:00:00 2001 From: amatuerCoder <17560146+bluerthanever@users.noreply.github.com> Date: Fri, 9 Oct 2020 10:07:30 +0800 Subject: [PATCH 282/363] Update page_parser.py --- weibo_spider/parser/page_parser.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/weibo_spider/parser/page_parser.py b/weibo_spider/parser/page_parser.py index c7c1083a..0ff0be93 100644 --- a/weibo_spider/parser/page_parser.py +++ b/weibo_spider/parser/page_parser.py @@ -57,12 +57,12 @@ def get_one_page(self, weibo_id_list): if self.is_pinned_weibo(info[i]): continue else: - return weibos, weibo_id_list + return weibos, weibo_id_list, False logger.info(weibo) logger.info('-' * 100) weibos.append(weibo) weibo_id_list.append(weibo.id) - return weibos, weibo_id_list + return weibos, weibo_id_list, True except Exception as e: logger.exception(e) From 3b57d8e862a56d7f9772a953ac48f330bb1c0842 Mon Sep 17 00:00:00 2001 From: amatuerCoder <17560146+bluerthanever@users.noreply.github.com> Date: Fri, 9 Oct 2020 10:08:59 +0800 Subject: [PATCH 283/363] Update spider.py --- weibo_spider/spider.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/weibo_spider/spider.py 
b/weibo_spider/spider.py index dde659c6..4ef86533 100644 --- a/weibo_spider/spider.py +++ b/weibo_spider/spider.py @@ -142,7 +142,7 @@ def get_weibo_info(self): page1 = 0 random_pages = random.randint(*self.random_wait_pages) for page in tqdm(range(1, page_num + 1), desc='Progress'): - weibos, self.weibo_id_list = PageParser( + weibos, self.weibo_id_list, to_continue = PageParser( self.cookie, self.user_config, page, self.filter).get_one_page( self.weibo_id_list) # 获取第page页的全部微博 @@ -155,7 +155,7 @@ def get_weibo_info(self): '-' * 30, ) self.page_count += 1 - if weibos: + if to_continue: yield weibos else: return weibos From 89e15fa9414203023cb64d44a9f884aefd3218d7 Mon Sep 17 00:00:00 2001 From: amatuerCoder <17560146+bluerthanever@users.noreply.github.com> Date: Fri, 9 Oct 2020 10:14:01 +0800 Subject: [PATCH 284/363] Update test_page_parser.py --- tests/test_parser/test_page_parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_parser/test_page_parser.py b/tests/test_parser/test_page_parser.py index 4eb637ab..9de978c2 100644 --- a/tests/test_parser/test_page_parser.py +++ b/tests/test_parser/test_page_parser.py @@ -16,7 +16,7 @@ def test_page_parser(): user_config=user_config, page=2, filter=True) - weibos, weibo_id_list = page_parser.get_one_page([]) + weibos, weibo_id_list, to_continue = page_parser.get_one_page([]) assert (weibo_id_list == ['J4PGk4yMw', 'J4EUStJKu']) assert (len(weibos) == 2) assert (str(weibos[0]) == """生日动态 \xa0\n""" From c7881f8e8afbedbceb507687cd9b727c76773694 Mon Sep 17 00:00:00 2001 From: codingc Date: Sun, 11 Oct 2020 00:34:01 +0800 Subject: [PATCH 285/363] add function: bid2mid --- weibo_spider/parser/util.py | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/weibo_spider/parser/util.py b/weibo_spider/parser/util.py index 55f61372..b05efe1a 100644 --- a/weibo_spider/parser/util.py +++ b/weibo_spider/parser/util.py @@ -51,3 +51,30 @@ def handle_garbled(info): return info except 
Exception as e: logger.exception(e) + + +def bid2mid(bid): + """convert string bid to string mid""" + alphabet = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ' + base = len(alphabet) + bidlen = len(bid) + head = bidlen % 4 + digit = int((bidlen-head)/4) + dlist = [bid[0:head]] + for d in range(1,digit+1): + dlist.append(bid[head:head+d*4]) + head += 4 + mid = '' + for d in dlist: + num = 0 + idx = 0 + strlen = len(d) + for char in d: + power = (strlen - (idx + 1)) + num += alphabet.index(char) * (base ** power) + idx += 1 + strnum = str(num) + while (len(d) == 4 and len(strnum) < 7): + strnum = '0' + strnum + mid += strnum + return mid From cf3cb79a5d3e260d948361f51f9c6ef5f0e32e50 Mon Sep 17 00:00:00 2001 From: dataabc Date: Wed, 14 Oct 2020 18:20:06 +0800 Subject: [PATCH 286/363] =?UTF-8?q?fix:=20=E4=BF=AE=E6=94=B9=E6=97=A0?= =?UTF-8?q?=E6=B3=95=E4=BF=9D=E5=AD=98=E6=9C=80=E5=90=8E=E4=B8=80=E9=A1=B5?= =?UTF-8?q?=E5=BE=AE=E5=8D=9A=E7=9A=84=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Issue #240 --- weibo_spider/spider.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/weibo_spider/spider.py b/weibo_spider/spider.py index 4ef86533..9c26e1e4 100644 --- a/weibo_spider/spider.py +++ b/weibo_spider/spider.py @@ -155,10 +155,10 @@ def get_weibo_info(self): '-' * 30, ) self.page_count += 1 - if to_continue: + if weibos: yield weibos - else: - return weibos + if not to_continue: + break # 通过加入随机等待避免被限制。爬虫速度过快容易被系统限制(一段时间后限 # 制会自动解除),加入随机等待模拟人的操作,可降低被系统限制的风险。默 From 190f4610a111d06fbeaba97ed4995b93a1358b27 Mon Sep 17 00:00:00 2001 From: dataabc Date: Sat, 17 Oct 2020 19:39:56 +0800 Subject: [PATCH 287/363] =?UTF-8?q?fix:=20=E4=BF=AE=E5=A4=8D=E5=9B=A0?= =?UTF-8?q?=E6=B2=A1=E6=9C=89=E6=9F=90=E4=BA=9B=E8=A7=86=E9=A2=91=E6=B5=8F?= =?UTF-8?q?=E8=A7=88=E6=9D=83=E9=99=90=E8=80=8C=E5=87=BA=E9=94=99=E7=9A=84?= =?UTF-8?q?=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: 
text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- setup.py | 2 +- weibo_spider/parser/page_parser.py | 21 +++++++++++++-------- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/setup.py b/setup.py index 7cdc4b9f..30873b5a 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setuptools.setup( name='weibo-spider', - version='0.1.1', + version='0.1.4', author='Chen Lei', author_email='chillychen1991@gmail.com', description='新浪微博爬虫,用python爬取新浪微博数据。', diff --git a/weibo_spider/parser/page_parser.py b/weibo_spider/parser/page_parser.py index 0ff0be93..ccfb273c 100644 --- a/weibo_spider/parser/page_parser.py +++ b/weibo_spider/parser/page_parser.py @@ -1,3 +1,4 @@ +import json import logging import re import sys @@ -271,14 +272,18 @@ def get_video_url(self, info, is_original): if video_link != u'无': video_link = video_link.replace( 'm.weibo.cn/s/video/show', 'm.weibo.cn/s/video/object') - wb_info = requests.get(video_link, - cookies=self.cookie).json() - video_url = wb_info['data']['object']['stream'].get( - 'hd_url') - if not video_url: - video_url = wb_info['data']['object']['stream']['url'] - if not video_url: # 说明该视频为直播 - video_url = u'无' + try: + wb_info = requests.get(video_link, + cookies=self.cookie).json() + video_url = wb_info['data']['object']['stream'].get( + 'hd_url') + if not video_url: + video_url = wb_info['data']['object']['stream'][ + 'url'] + if not video_url: # 说明该视频为直播 + video_url = u'无' + except json.decoder.JSONDecodeError: + logger.warning(u'当前账号没有浏览该视频的权限') return video_url except Exception as e: logger.exception(e) From d224940d1a5d64b14607b86df480b16f1887a796 Mon Sep 17 00:00:00 2001 From: moqi Date: Sun, 18 Oct 2020 16:37:29 +0800 Subject: [PATCH 288/363] git ignore .idea --- .gitignore | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 0dc0cdf5..77108998 100644 --- a/.gitignore +++ b/.gitignore @@ -10,4 +10,6 @@ dist/ config.json weibo/ -*.log \ No newline at end of file 
+*.log + +.idea From 406c810145fca606b639bf39c72040bd7cf0cf04 Mon Sep 17 00:00:00 2001 From: moqi Date: Mon, 19 Oct 2020 10:13:25 +0800 Subject: [PATCH 289/363] =?UTF-8?q?=E5=B0=86=20MySQL=20=E5=BA=93=E5=BB=BA?= =?UTF-8?q?=E8=A1=A8=E8=AF=AD=E5=8F=A5=E7=9A=84=E6=AD=A3=E6=96=87=E9=95=BF?= =?UTF-8?q?=E5=BA=A6=E4=B8=8A=E9=99=90=E6=8F=90=E9=AB=98=E5=88=B0=205000?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- weibo_spider/writer/mysql_writer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/weibo_spider/writer/mysql_writer.py b/weibo_spider/writer/mysql_writer.py index f731c464..83831745 100644 --- a/weibo_spider/writer/mysql_writer.py +++ b/weibo_spider/writer/mysql_writer.py @@ -85,7 +85,7 @@ def write_weibo(self, weibos): CREATE TABLE IF NOT EXISTS weibo ( id varchar(10) NOT NULL, user_id varchar(12), - content varchar(2000), + content varchar(5000), article_url varchar(200), original_pictures varchar(3000), retweet_pictures varchar(3000), From 3d042898f123946dc10280287508456f70ebd074 Mon Sep 17 00:00:00 2001 From: dataabc Date: Mon, 9 Nov 2020 22:39:18 +0800 Subject: [PATCH 290/363] =?UTF-8?q?perf:=20=E4=BC=98=E5=8C=96=E5=BE=AE?= =?UTF-8?q?=E5=8D=9A=E8=AE=BF=E9=97=AE?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Issue #253 --- weibo_spider/parser/page_parser.py | 7 ++++++- weibo_spider/parser/util.py | 12 +++++++----- weibo_spider/spider.py | 2 +- 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/weibo_spider/parser/page_parser.py b/weibo_spider/parser/page_parser.py index ccfb273c..6bcc32a6 100644 --- a/weibo_spider/parser/page_parser.py +++ b/weibo_spider/parser/page_parser.py @@ -273,8 +273,13 @@ def get_video_url(self, info, is_original): video_link = video_link.replace( 'm.weibo.cn/s/video/show', 'm.weibo.cn/s/video/object') try: + user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) 
Chrome/86.0.4240.111 Safari/537.36' + headers = { + 'User_Agent': user_agent, + 'Cookie': self.cookie + } wb_info = requests.get(video_link, - cookies=self.cookie).json() + headers=headers).json() video_url = wb_info['data']['object']['stream'].get( 'hd_url') if not video_url: diff --git a/weibo_spider/parser/util.py b/weibo_spider/parser/util.py index b05efe1a..52e39712 100644 --- a/weibo_spider/parser/util.py +++ b/weibo_spider/parser/util.py @@ -19,7 +19,9 @@ def hash_url(url): def handle_html(cookie, url): """处理html""" try: - resp = requests.get(url, cookies=cookie) + user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36' + headers = {'User_Agent': user_agent, 'Cookie': cookie} + resp = requests.get(url, headers=headers) if GENERATE_TEST_DATA: import io @@ -59,10 +61,10 @@ def bid2mid(bid): base = len(alphabet) bidlen = len(bid) head = bidlen % 4 - digit = int((bidlen-head)/4) + digit = int((bidlen - head) / 4) dlist = [bid[0:head]] - for d in range(1,digit+1): - dlist.append(bid[head:head+d*4]) + for d in range(1, digit + 1): + dlist.append(bid[head:head + d * 4]) head += 4 mid = '' for d in dlist: @@ -71,7 +73,7 @@ def bid2mid(bid): strlen = len(d) for char in d: power = (strlen - (idx + 1)) - num += alphabet.index(char) * (base ** power) + num += alphabet.index(char) * (base**power) idx += 1 strnum = str(num) while (len(d) == 4 and len(strnum) < 7): diff --git a/weibo_spider/spider.py b/weibo_spider/spider.py index 9c26e1e4..82505b3c 100644 --- a/weibo_spider/spider.py +++ b/weibo_spider/spider.py @@ -61,7 +61,7 @@ def __init__(self, config): 'pic_download'] # 取值范围为0、1,程序默认值为0,代表不下载微博原始图片,1代表下载 self.video_download = config[ 'video_download'] # 取值范围为0、1,程序默认为0,代表不下载微博视频,1代表下载 - self.cookie = {'Cookie': config['cookie']} + self.cookie = config['cookie'] self.mysql_config = config.get('mysql_config') # MySQL数据库连接配置,可以不填 self.sqlite_config = config.get('sqlite_config') From 
5e29c2834c656673dafd609246e52a764cf5088c Mon Sep 17 00:00:00 2001 From: dataabc Date: Mon, 9 Nov 2020 23:18:39 +0800 Subject: [PATCH 291/363] =?UTF-8?q?fix:=20=E4=BF=AE=E5=A4=8Dtest=E9=94=99?= =?UTF-8?q?=E8=AF=AF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/test_parser/util.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_parser/util.py b/tests/test_parser/util.py index 19292ecc..959a1dca 100644 --- a/tests/test_parser/util.py +++ b/tests/test_parser/util.py @@ -1,11 +1,11 @@ -from unittest.mock import Mock import json import os +from unittest.mock import Mock from weibo_spider.parser.util import TEST_DATA_DIR, URL_MAP_FILE -def mock_request_get_content(url, cookies): +def mock_request_get_content(url, headers): with open(os.path.join(TEST_DATA_DIR, URL_MAP_FILE)) as f: url_map = json.loads(f.read()) resp_file = url_map[url] From 27eb176fb8fad2b7465fc61aecdc4adbc70fb9b6 Mon Sep 17 00:00:00 2001 From: Zhengyang Song Date: Sat, 21 Nov 2020 23:33:12 +0800 Subject: [PATCH 292/363] Minor typo fix. 
--- weibo_spider/config_util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/weibo_spider/config_util.py b/weibo_spider/config_util.py index 48b00ebb..691eb52c 100644 --- a/weibo_spider/config_util.py +++ b/weibo_spider/config_util.py @@ -92,7 +92,7 @@ def validate_config(config): for mode in config['write_mode']: if mode not in write_mode: logger.warning( - u'%s为无效模式,请从txt、csv、json、mongo和mysql、sqlite中挑选一个或多个作为write_mode', + u'%s为无效模式,请从txt、csv、json、mongo、sqlite和mysql中挑选一个或多个作为write_mode', mode) sys.exit() From 5d29f10b4b29eb45be5c44c984849e6b99c71e14 Mon Sep 17 00:00:00 2001 From: dataabc Date: Tue, 1 Dec 2020 19:29:43 +0800 Subject: [PATCH 293/363] =?UTF-8?q?feat:=20=20=E6=B7=BB=E5=8A=A0=E9=85=8D?= =?UTF-8?q?=E7=BD=AE=E7=BB=93=E6=9E=9C=E7=9B=AE=E5=BD=95=E6=96=87=E4=BB=B6?= =?UTF-8?q?=E5=90=8D=E7=9A=84=E5=8A=9F=E8=83=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 通过result_dir_name配置,值为0,以昵称为结果目录名,更容易识别不同用户;值为1,以id为结果目录名,使前后结果更一致,因为昵称可能改变,id不会改变。 Issue #260 --- setup.py | 2 +- weibo_spider/config_sample.json | 1 + weibo_spider/spider.py | 10 +++++++--- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/setup.py b/setup.py index 30873b5a..5f721573 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setuptools.setup( name='weibo-spider', - version='0.1.4', + version='0.1.6', author='Chen Lei', author_email='chillychen1991@gmail.com', description='新浪微博爬虫,用python爬取新浪微博数据。', diff --git a/weibo_spider/config_sample.json b/weibo_spider/config_sample.json index f5e9f764..d5d6ba88 100644 --- a/weibo_spider/config_sample.json +++ b/weibo_spider/config_sample.json @@ -9,6 +9,7 @@ "write_mode": ["csv", "txt"], "pic_download": 1, "video_download": 1, + "result_dir_name": 0, "cookie": "your cookie", "mysql_config": { "host": "localhost", diff --git a/weibo_spider/spider.py b/weibo_spider/spider.py index 82505b3c..09a64d89 100644 --- a/weibo_spider/spider.py +++ b/weibo_spider/spider.py @@ -61,6 
+61,8 @@ def __init__(self, config): 'pic_download'] # 取值范围为0、1,程序默认值为0,代表不下载微博原始图片,1代表下载 self.video_download = config[ 'video_download'] # 取值范围为0、1,程序默认为0,代表不下载微博视频,1代表下载 + self.result_dir_name = config.get( + 'result_dir_name', 0) # 结果目录名,取值为0或1,决定结果文件存储在用户昵称文件夹里还是用户id文件夹里 self.cookie = config['cookie'] self.mysql_config = config.get('mysql_config') # MySQL数据库连接配置,可以不填 @@ -181,11 +183,13 @@ def get_weibo_info(self): def _get_filepath(self, type): """获取结果文件路径""" try: + dir_name = self.user.nickname + if self.result_dir_name: + dir_name = self.user.id if FLAGS.output_dir is not None: - file_dir = FLAGS.output_dir + os.sep + self.user.nickname + file_dir = FLAGS.output_dir + os.sep + dir_name else: - file_dir = (os.getcwd() + os.sep + 'weibo' + os.sep + - self.user.nickname) + file_dir = (os.getcwd() + os.sep + 'weibo' + os.sep + dir_name) if type == 'img' or type == 'video': file_dir = file_dir + os.sep + type if not os.path.isdir(file_dir): From bb79fef69b10f320bc1b76d2330874981ea0e3fb Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Tue, 1 Dec 2020 19:52:27 +0800 Subject: [PATCH 294/363] Update settings.md --- docs/settings.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/docs/settings.md b/docs/settings.md index 3ae83be9..50d56a4f 100644 --- a/docs/settings.md +++ b/docs/settings.md @@ -16,6 +16,7 @@ $ python3 -m weibo_spider "write_mode": ["csv", "txt"], "pic_download": 1, "video_download": 1, + "result_dir_name": 0, "cookie": "your cookie", "mysql_config": { "host": "localhost", @@ -88,6 +89,12 @@ video_download控制是否下载微博中的视频,值为1代表下载,值 "video_download": 1, ``` 代表下载微博中的视频。
+**设置result_dir_name**
+result_dir_name控制结果目录的名字,可选值为0和1,默认值为0: +``` +"result_dir_name": 0, +``` +值为0表示将结果文件保存在以用户昵称为名的文件夹里,这样结果更清晰;值为1表示将结果保存在以用户id为名的文件夹里,这样更能保证多次爬取的一致性,因为用户昵称可以改变,用户id是不变的。
**设置cookie**
请按照[如何获取cookie](https://github.com/dataabc/weiboSpider/blob/master/docs/cookie.md),获取cookie,然后将“your cookie”替换成真实的cookie值。
**设置mysql_config(可选)**
From ce3f3123e8afaf53d2bd96e206e10d82846612e7 Mon Sep 17 00:00:00 2001 From: windlively Date: Fri, 4 Dec 2020 17:02:26 +0800 Subject: [PATCH 295/363] =?UTF-8?q?feature:=20=E5=A2=9E=E5=8A=A0kafka=20wr?= =?UTF-8?q?iter=EF=BC=8C=E6=94=AF=E6=8C=81=E5=90=91kafka=E5=AE=9E=E6=97=B6?= =?UTF-8?q?=E6=8E=A8=E9=80=81=E7=88=AC=E5=8F=96=E7=9A=84=E6=95=B0=E6=8D=AE?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- weibo_spider/config_sample.json | 5 +++++ weibo_spider/config_util.py | 4 ++-- weibo_spider/spider.py | 7 ++++++- weibo_spider/writer/__init__.py | 3 ++- weibo_spider/writer/kafka_writer.py | 31 +++++++++++++++++++++++++++++ 5 files changed, 46 insertions(+), 4 deletions(-) create mode 100644 weibo_spider/writer/kafka_writer.py diff --git a/weibo_spider/config_sample.json b/weibo_spider/config_sample.json index d5d6ba88..6f028faa 100644 --- a/weibo_spider/config_sample.json +++ b/weibo_spider/config_sample.json @@ -18,5 +18,10 @@ "password": "123456", "charset": "utf8mb4" }, + "kafka_config": { + "bootstrap-server": "127.0.0.1:9092", + "weibo_topics": ["spider_weibo"], + "user_topics": ["spider_weibo"] + }, "sqlite_config": "weibo.db" } diff --git a/weibo_spider/config_util.py b/weibo_spider/config_util.py index 691eb52c..55e4bdd8 100644 --- a/weibo_spider/config_util.py +++ b/weibo_spider/config_util.py @@ -85,14 +85,14 @@ def validate_config(config): sys.exit() # 验证write_mode - write_mode = ['txt', 'csv', 'json', 'mongo', 'mysql', 'sqlite'] + write_mode = ['txt', 'csv', 'json', 'mongo', 'mysql', 'sqlite', 'kafka'] if not isinstance(config['write_mode'], list): logger.warning(u'write_mode值应为list类型') sys.exit() for mode in config['write_mode']: if mode not in write_mode: logger.warning( - u'%s为无效模式,请从txt、csv、json、mongo、sqlite和mysql中挑选一个或多个作为write_mode', + u'%s为无效模式,请从txt、csv、json、mongo、sqlite, kafka和mysql中挑选一个或多个作为write_mode', mode) sys.exit() diff --git a/weibo_spider/spider.py b/weibo_spider/spider.py index 
09a64d89..0e85669f 100644 --- a/weibo_spider/spider.py +++ b/weibo_spider/spider.py @@ -67,7 +67,7 @@ def __init__(self, config): self.mysql_config = config.get('mysql_config') # MySQL数据库连接配置,可以不填 self.sqlite_config = config.get('sqlite_config') - + self.kafka_config = config.get('kafka_config') self.user_config_file_path = '' user_id_list = config['user_id_list'] if FLAGS.user_id_list: @@ -238,6 +238,11 @@ def initialize_info(self, user_config): self.writers.append(SqliteWriter(self.sqlite_config)) + if 'kafka' in self.write_mode: + from .writer import KafkaWriter + + self.writers.append(KafkaWriter(self.kafka_config)) + self.downloaders = [] if self.pic_download == 1: from .downloader import ImgDownloader diff --git a/weibo_spider/writer/__init__.py b/weibo_spider/writer/__init__.py index 024f9937..5868f1ac 100644 --- a/weibo_spider/writer/__init__.py +++ b/weibo_spider/writer/__init__.py @@ -4,5 +4,6 @@ from .mysql_writer import MySqlWriter from .txt_writer import TxtWriter from .sqlite_writer import SqliteWriter +from .kafka_writer import KafkaWriter -__all__ = [CsvWriter, TxtWriter, JsonWriter, MongoWriter, MySqlWriter, SqliteWriter] +__all__ = [CsvWriter, TxtWriter, JsonWriter, MongoWriter, MySqlWriter, SqliteWriter, KafkaWriter] diff --git a/weibo_spider/writer/kafka_writer.py b/weibo_spider/writer/kafka_writer.py new file mode 100644 index 00000000..ce0bc48e --- /dev/null +++ b/weibo_spider/writer/kafka_writer.py @@ -0,0 +1,31 @@ +import json +import logging + +from kafka import KafkaProducer + +from .writer import Writer +logger = logging.getLogger('spider.mysql_writer') + + +class KafkaWriter(Writer): + + def __init__(self, kafka_config): + self.kafka_config = kafka_config + self.producer = KafkaProducer(bootstrap_servers=str(kafka_config['bootstrap-server']).split(','), + value_serializer=lambda m: json.dumps(m,ensure_ascii=False).encode('UTF-8')) + self.weibo_topics = list(kafka_config['weibo_topics']) + self.user_topics = 
list(kafka_config['user_topics']) + logger.info('{}', kafka_config) + + def write_weibo(self, weibo): + for w in weibo: + for topic in self.weibo_topics: + self.producer.send(topic, value=w.__dict__) + + def write_user(self, user): + for topic in self.user_topics: + self.producer.send(topic, value=user.__dict__) + + def __del__(self): + self.producer.close() + From 03c8c8925fd2110449331f7660f6728cd63b0907 Mon Sep 17 00:00:00 2001 From: baijunhan <47207854+baijunhan@users.noreply.github.com> Date: Fri, 4 Dec 2020 18:50:51 +0800 Subject: [PATCH 296/363] Update weibo_spider/writer/kafka_writer.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fix: KafkaWriter日志错误 Co-authored-by: Chen Lei --- weibo_spider/writer/kafka_writer.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/weibo_spider/writer/kafka_writer.py b/weibo_spider/writer/kafka_writer.py index ce0bc48e..a20c54c6 100644 --- a/weibo_spider/writer/kafka_writer.py +++ b/weibo_spider/writer/kafka_writer.py @@ -4,7 +4,7 @@ from kafka import KafkaProducer from .writer import Writer -logger = logging.getLogger('spider.mysql_writer') +logger = logging.getLogger('spider.kafka_writer') class KafkaWriter(Writer): @@ -28,4 +28,3 @@ def write_user(self, user): def __del__(self): self.producer.close() - From 669bf2ceb1efbdb1ffe8e2413afa9459fd0b9974 Mon Sep 17 00:00:00 2001 From: dataabc Date: Sat, 5 Dec 2020 21:00:03 +0800 Subject: [PATCH 297/363] =?UTF-8?q?fix:=20=E4=BF=AE=E5=A4=8D=E6=B2=A1?= =?UTF-8?q?=E6=9C=89=E5=AE=89=E8=A3=85kafka=E8=80=8C=E5=87=BA=E9=94=99?= =?UTF-8?q?=E7=9A=84bug?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Issue #266 --- setup.py | 2 +- weibo_spider/writer/kafka_writer.py | 18 +++++++++++++----- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/setup.py b/setup.py index 5f721573..eea943e6 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setuptools.setup( name='weibo-spider', 
- version='0.1.6', + version='0.1.7', author='Chen Lei', author_email='chillychen1991@gmail.com', description='新浪微博爬虫,用python爬取新浪微博数据。', diff --git a/weibo_spider/writer/kafka_writer.py b/weibo_spider/writer/kafka_writer.py index a20c54c6..64ef4749 100644 --- a/weibo_spider/writer/kafka_writer.py +++ b/weibo_spider/writer/kafka_writer.py @@ -1,18 +1,26 @@ import json import logging - -from kafka import KafkaProducer +import sys from .writer import Writer + logger = logging.getLogger('spider.kafka_writer') class KafkaWriter(Writer): - def __init__(self, kafka_config): + try: + from kafka import KafkaProducer + except ImportError: + logger.warning( + u'系统中可能没有安装kafka库,请先运行 pip install kafka-python ,再运行程序') + sys.exit() + self.kafka_config = kafka_config - self.producer = KafkaProducer(bootstrap_servers=str(kafka_config['bootstrap-server']).split(','), - value_serializer=lambda m: json.dumps(m,ensure_ascii=False).encode('UTF-8')) + self.producer = KafkaProducer( + bootstrap_servers=str(kafka_config['bootstrap-server']).split(','), + value_serializer=lambda m: json.dumps(m, ensure_ascii=False + ).encode('UTF-8')) self.weibo_topics = list(kafka_config['weibo_topics']) self.user_topics = list(kafka_config['user_topics']) logger.info('{}', kafka_config) From e8cacd3786100ccf4806d28142911a232513f952 Mon Sep 17 00:00:00 2001 From: eggachecat Date: Wed, 9 Dec 2020 15:22:53 +0800 Subject: [PATCH 298/363] =?UTF-8?q?feat:=20user=5Fid=5Flist=E5=9C=A8config?= =?UTF-8?q?.json=E6=94=AF=E6=8C=81=E6=B7=B7=E5=90=88=E7=9A=84=E6=A8=A1?= =?UTF-8?q?=E5=BC=8F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fuck --- weibo_spider/spider.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/weibo_spider/spider.py b/weibo_spider/spider.py index 0e85669f..c24f947f 100644 --- a/weibo_spider/spider.py +++ b/weibo_spider/spider.py @@ -82,12 +82,20 @@ def __init__(self, config): if FLAGS.u: user_id_list = 
FLAGS.u.split(',') if isinstance(user_id_list, list): - user_id_list = list(set(user_id_list)) - user_config_list = [{ - 'user_uri': user_id, + # 第一部分是处理dict类型的 + # 第二部分是其他类型,其他类型提供去重功能 + user_config_list = list(map( + lambda x: { + 'user_uri': x['id'], + 'since_date': x.get('since_date', self.since_date), + 'end_date': x.get('end_date', self.end_date), + }, [user_id for user_id in user_id_list if isinstance(user_id, dict)] + )) + list(map(lambda x: { + 'user_uri': x, 'since_date': self.since_date, 'end_date': self.end_date - } for user_id in user_id_list] + }, set([user_id for user_id in user_id_list if not isinstance(user_id, dict)])) + ) if FLAGS.u: config_util.add_user_uri_list(self.user_config_file_path, user_id_list) @@ -147,7 +155,7 @@ def get_weibo_info(self): weibos, self.weibo_id_list, to_continue = PageParser( self.cookie, self.user_config, page, self.filter).get_one_page( - self.weibo_id_list) # 获取第page页的全部微博 + self.weibo_id_list) # 获取第page页的全部微博 logger.info( u'%s已获取%s(%s)的第%d页微博%s', '-' * 30, From 50034ce19012752d8124eda8cd4a8c3140eed583 Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Thu, 10 Dec 2020 14:17:45 +0800 Subject: [PATCH 299/363] Update example.md --- docs/example.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/example.md b/docs/example.md index 56c33fda..1d4519a5 100644 --- a/docs/example.md +++ b/docs/example.md @@ -9,12 +9,13 @@ "write_mode": ["csv", "txt", "json"], "pic_download": 1, "video_download": 1, + "result_dir_name": 0, "cookie": "your cookie" } ``` 对于上述参数的含义以及取值范围,这里仅作简单介绍,详细信息见[程序设置](https://github.com/dataabc/weiboSpider/blob/master/docs/settings.md)。 ->**user_id_list**代表我们要爬取的微博用户的user_id,可以是一个或多个,也可以是文件路径,微博用户Dear-迪丽热巴的user_id为1669879400,具体如何获取user_id见[如何获取user_id](https://github.com/dataabc/weiboSpider/blob/master/docs/userid.md);
**filter**的值为1代表爬取全部原创微博,值为0代表爬取全部微博(原创+转发);
**since_date**代表我们要爬取since_date日期之后发布的微博,因为我要爬迪丽热巴的全部原创微博,所以since_date设置了一个非常早的值;
**end_date**代表我们要爬取end_date日期之前发布的微博,since_date配合end_date,表示我们要爬取发布日期在since_date和end_date之间的微博,包含边界,如果end_date值为"now",表示爬取发布日期从since_date到现在的微博;
**write_mode**代表结果文件的保存类型,我想要把结果写入txt文件、csv文件和json文件,所以它的值为["csv", "txt", "json"],如果你想写入数据库,具体设置见[设置数据库](https://github.com/dataabc/weiboSpider/blob/master/docs/settings.md#设置数据库可选);
**pic_download**值为1代表下载微博中的图片,值为0代表不下载;
**video_download**值为1代表下载微博中的视频,值为0代表不下载;
**cookie**是爬虫微博的cookie,具体如何获取cookie见[cookie文档](https://github.com/dataabc/weiboSpider/blob/master/docs/cookie.md),获取cookie后把"your cookie"替换成真实的cookie值即可。
+>**user_id_list**代表我们要爬取的微博用户的user_id,可以是一个或多个,也可以是文件路径,微博用户Dear-迪丽热巴的user_id为1669879400,具体如何获取user_id见[如何获取user_id](https://github.com/dataabc/weiboSpider/blob/master/docs/userid.md);
**filter**的值为1代表爬取全部原创微博,值为0代表爬取全部微博(原创+转发);
**since_date**代表我们要爬取since_date日期之后发布的微博,因为我要爬迪丽热巴的全部原创微博,所以since_date设置了一个非常早的值;
**end_date**代表我们要爬取end_date日期之前发布的微博,since_date配合end_date,表示我们要爬取发布日期在since_date和end_date之间的微博,包含边界,如果end_date值为"now",表示爬取发布日期从since_date到现在的微博;
**write_mode**代表结果文件的保存类型,我想要把结果写入txt文件、csv文件和json文件,所以它的值为["csv", "txt", "json"],如果你想写入数据库,具体设置见[设置数据库](https://github.com/dataabc/weiboSpider/blob/master/docs/settings.md#设置数据库可选);
**pic_download**值为1代表下载微博中的图片,值为0代表不下载;
**video_download**值为1代表下载微博中的视频,值为0代表不下载;
**result_dir_name**控制结果文件夹名,值为1代表文件夹名是用户id,值为0代表文件夹名是用户昵称;
**cookie**是爬虫微博的cookie,具体如何获取cookie见[cookie文档](https://github.com/dataabc/weiboSpider/blob/master/docs/cookie.md),获取cookie后把"your cookie"替换成真实的cookie值即可。
cookie修改完成后在weiboSpider目录下运行如下命令: ```bash From 3b21f767d5356260ea3bc8a28a952a399c9dec18 Mon Sep 17 00:00:00 2001 From: dataabc Date: Wed, 16 Dec 2020 20:36:29 +0800 Subject: [PATCH 300/363] =?UTF-8?q?feat:=20=E6=B7=BB=E5=8A=A0=E6=96=87?= =?UTF-8?q?=E4=BB=B6=E4=B8=8B=E8=BD=BD=E8=B6=85=E6=97=B6=E6=8E=A7=E5=88=B6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 通过config.json的file_download_timeout参数控制,该参数是列表形式,包含三个数字,第一个代表最多重试数,第二个代表最大连接时间,第三个代表最大读取时间 Issue #267 --- setup.py | 2 +- weibo_spider/config_sample.json | 1 + weibo_spider/downloader/downloader.py | 18 +++++++-- weibo_spider/downloader/img_downloader.py | 4 +- weibo_spider/downloader/video_downloader.py | 4 +- weibo_spider/spider.py | 45 ++++++++++++++------- 6 files changed, 50 insertions(+), 24 deletions(-) diff --git a/setup.py b/setup.py index eea943e6..014f600a 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setuptools.setup( name='weibo-spider', - version='0.1.7', + version='0.1.8', author='Chen Lei', author_email='chillychen1991@gmail.com', description='新浪微博爬虫,用python爬取新浪微博数据。', diff --git a/weibo_spider/config_sample.json b/weibo_spider/config_sample.json index 6f028faa..b2652dfa 100644 --- a/weibo_spider/config_sample.json +++ b/weibo_spider/config_sample.json @@ -9,6 +9,7 @@ "write_mode": ["csv", "txt"], "pic_download": 1, "video_download": 1, + "file_download_timeout": [5, 5, 10], "result_dir_name": 0, "cookie": "your cookie", "mysql_config": { diff --git a/weibo_spider/downloader/downloader.py b/weibo_spider/downloader/downloader.py index 63b1ee26..4a07d67b 100644 --- a/weibo_spider/downloader/downloader.py +++ b/weibo_spider/downloader/downloader.py @@ -12,10 +12,17 @@ class Downloader(ABC): - def __init__(self, file_dir): + def __init__(self, file_dir, file_download_timeout): self.file_dir = file_dir - self.describe = u'' + self.describe = '' self.key = '' + self.file_download_timeout = [5, 5, 10] + if (isinstance(file_download_timeout, list) + and 
len(file_download_timeout) == 3): + for i in range(3): + v = file_download_timeout[i] + if isinstance(v, (int, float)) and v > 0: + self.file_download_timeout[i] = v @abstractmethod def handle_download(self, urls, w): @@ -27,8 +34,11 @@ def download_one_file(self, url, file_path, weibo_id): try: if not os.path.isfile(file_path): s = requests.Session() - s.mount(url, HTTPAdapter(max_retries=5)) - downloaded = s.get(url, timeout=(5, 10)) + s.mount(url, + HTTPAdapter(max_retries=self.file_download_timeout[0])) + downloaded = s.get(url, + timeout=(self.file_download_timeout[1], + self.file_download_timeout[2])) with open(file_path, 'wb') as f: f.write(downloaded.content) except Exception as e: diff --git a/weibo_spider/downloader/img_downloader.py b/weibo_spider/downloader/img_downloader.py index 88274062..7655425c 100644 --- a/weibo_spider/downloader/img_downloader.py +++ b/weibo_spider/downloader/img_downloader.py @@ -4,8 +4,8 @@ class ImgDownloader(Downloader): - def __init__(self, file_dir): - self.file_dir = file_dir + def __init__(self, file_dir, file_download_timeout): + super().__init__(file_dir, file_download_timeout) self.describe = u'图片' self.key = 'original_pictures' diff --git a/weibo_spider/downloader/video_downloader.py b/weibo_spider/downloader/video_downloader.py index cc0171ba..3e9953de 100644 --- a/weibo_spider/downloader/video_downloader.py +++ b/weibo_spider/downloader/video_downloader.py @@ -4,8 +4,8 @@ class VideoDownloader(Downloader): - def __init__(self, file_dir): - self.file_dir = file_dir + def __init__(self, file_dir, file_download_timeout): + super().__init__(file_dir, file_download_timeout) self.describe = u'视频' self.key = 'video_url' diff --git a/weibo_spider/spider.py b/weibo_spider/spider.py index c24f947f..33bf3163 100644 --- a/weibo_spider/spider.py +++ b/weibo_spider/spider.py @@ -61,6 +61,10 @@ def __init__(self, config): 'pic_download'] # 取值范围为0、1,程序默认值为0,代表不下载微博原始图片,1代表下载 self.video_download = config[ 'video_download'] # 
取值范围为0、1,程序默认为0,代表不下载微博视频,1代表下载 + self.file_download_timeout = config.get( + 'file_download_timeout', + [5, 5, 10 + ]) # 控制文件下载“超时”时的操作,值是list形式,包含三个数字,依次分别是最大超时重试次数、最大连接时间和最大读取时间 self.result_dir_name = config.get( 'result_dir_name', 0) # 结果目录名,取值为0或1,决定结果文件存储在用户昵称文件夹里还是用户id文件夹里 self.cookie = config['cookie'] @@ -84,18 +88,26 @@ def __init__(self, config): if isinstance(user_id_list, list): # 第一部分是处理dict类型的 # 第二部分是其他类型,其他类型提供去重功能 - user_config_list = list(map( - lambda x: { - 'user_uri': x['id'], - 'since_date': x.get('since_date', self.since_date), - 'end_date': x.get('end_date', self.end_date), - }, [user_id for user_id in user_id_list if isinstance(user_id, dict)] - )) + list(map(lambda x: { - 'user_uri': x, - 'since_date': self.since_date, - 'end_date': self.end_date - }, set([user_id for user_id in user_id_list if not isinstance(user_id, dict)])) - ) + user_config_list = list( + map( + lambda x: { + 'user_uri': x['id'], + 'since_date': x.get('since_date', self.since_date), + 'end_date': x.get('end_date', self.end_date), + }, [ + user_id for user_id in user_id_list + if isinstance(user_id, dict) + ])) + list( + map( + lambda x: { + 'user_uri': x, + 'since_date': self.since_date, + 'end_date': self.end_date + }, + set([ + user_id for user_id in user_id_list + if not isinstance(user_id, dict) + ]))) if FLAGS.u: config_util.add_user_uri_list(self.user_config_file_path, user_id_list) @@ -155,7 +167,7 @@ def get_weibo_info(self): weibos, self.weibo_id_list, to_continue = PageParser( self.cookie, self.user_config, page, self.filter).get_one_page( - self.weibo_id_list) # 获取第page页的全部微博 + self.weibo_id_list) # 获取第page页的全部微博 logger.info( u'%s已获取%s(%s)的第%d页微博%s', '-' * 30, @@ -255,12 +267,15 @@ def initialize_info(self, user_config): if self.pic_download == 1: from .downloader import ImgDownloader - self.downloaders.append(ImgDownloader(self._get_filepath('img'))) + self.downloaders.append( + ImgDownloader(self._get_filepath('img'), + self.file_download_timeout)) if 
self.video_download == 1: from .downloader import VideoDownloader self.downloaders.append( - VideoDownloader(self._get_filepath('video'))) + VideoDownloader(self._get_filepath('video'), + self.file_download_timeout)) def start(self): """运行爬虫""" From 95613a439be5af26a9ac7d8627a21e93bb77c152 Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Tue, 22 Dec 2020 14:17:38 +0800 Subject: [PATCH 301/363] Update settings.md --- docs/settings.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/settings.md b/docs/settings.md index 50d56a4f..bdf48d50 100644 --- a/docs/settings.md +++ b/docs/settings.md @@ -64,7 +64,8 @@ since_date值可以是日期,也可以是整数。如果是日期,代表爬 代表爬取最近10天的微博,这个说法不是特别准确,准确说是爬取发布时间从**10天前到本程序开始执行时**之间的微博。
**since_date是所有user的爬取起始时间,非常不灵活。如果你要爬多个用户,并且想单独为每个用户设置一个since_date,可以使用[定期自动爬取微博](https://github.com/dataabc/weiboSpider/blob/master/docs/automation.md)方法二中的方法,该方法可以为多个用户设置不同的since_date,非常灵活。**
**设置end_date**
-end_date值可以是日期,也可以是"now"。如果是日期,代表爬取该日期之前的微博,格式应为“yyyy-mm-dd”;如果是"now",代表爬取发布日期从since_date到现在的微博。since_date配合end_date,表示爬取发布日期在since_date和end_date之间的微博,包含边界。since_date是起始日期,end_date是结束日期,因此end_date时间应晚于since_date。注意,since_date即可以通过config.json文件的since_date参数设置,也可以通过user_id_list.txt设置;而end_date只能通过config.json文件的end_date参数设置,是全局变量,所有user_id都使用同一个end_date。当end_date值不是"now"时,程序无法获取微博中的视频,如果想要获取视频,请为end_date赋值为"now"。
+end_date值可以是日期,也可以是"now"。如果是日期,代表爬取该日期之前的微博,格式应为“yyyy-mm-dd”;如果是"now",代表爬取发布日期从since_date到现在的微博。since_date配合end_date,表示爬取发布日期在since_date和end_date之间的微博,包含边界。since_date是起始日期,end_date是结束日期,因此end_date时间应晚于since_date。注意,since_date即可以通过config.json文件的since_date参数设置,也可以通过user_id_list.txt设置;而end_date只能通过config.json文件的end_date参数设置,是全局变量,所有user_id都使用同一个end_date。
+**推荐使用"now"作为end_date值**,当值为"now"时,获取结果是正确和稳定的;当end_date值不是"now"时,在爬微博数非常多的账号时,程序可能不稳定,得到很多空微博页,并且此时无法获取微博中的视频,如果想要获取视频,请为end_date赋值为"now"。
**设置random_wait_pages**
random_wait_pages值是一个长度为2的整数列表,代表每爬取x页微博暂停一次,x为整数,值在random_wait_pages列表两个整数之间随机获取。默认值为[1, 5],代表每爬取1到5页暂停一次,如果程序被限制,可以加快暂停频率,即适当减小random_wait_pages内的值。
**设置random_wait_seconds**
From 617a91011e47c1fd1fbfc29f478b9a5129d006b8 Mon Sep 17 00:00:00 2001 From: dataabc Date: Tue, 29 Dec 2020 19:46:57 +0800 Subject: [PATCH 302/363] =?UTF-8?q?perf:=20=E4=BC=98=E5=8C=96user=5Fid=5Fl?= =?UTF-8?q?ist.txt=E6=9B=B4=E6=96=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Issue #286 --- weibo_spider/spider.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/weibo_spider/spider.py b/weibo_spider/spider.py index 33bf3163..dd7abca1 100644 --- a/weibo_spider/spider.py +++ b/weibo_spider/spider.py @@ -197,6 +197,15 @@ def get_weibo_info(self): sleep(1) self.page_count = 0 self.global_wait.append(self.global_wait.pop(0)) + + # 更新用户user_id_list.txt中的since_date + if self.user_config_file_path or FLAGS.u: + config_util.update_user_config_file( + self.user_config_file_path, + self.user_config['user_uri'], + self.user.nickname, + self.new_since_date, + ) except Exception as e: logger.exception(e) @@ -310,14 +319,6 @@ def start(self): logger.info(u'共爬取' + str(self.got_num) + u'条原创微博') logger.info(u'信息抓取完毕') logger.info('*' * 100) - - if self.user_config_file_path or FLAGS.u: - config_util.update_user_config_file( - self.user_config_file_path, - self.user_config['user_uri'], - self.user.nickname, - self.new_since_date, - ) except Exception as e: logger.exception(e) From 819c83a69a1fbc31cbd2242691378b9d366abed0 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 7 Jan 2021 23:02:39 +0000 Subject: [PATCH 303/363] build(deps): bump lxml from 4.5.1 to 4.6.2 Bumps [lxml](https://github.com/lxml/lxml) from 4.5.1 to 4.6.2. 
- [Release notes](https://github.com/lxml/lxml/releases) - [Changelog](https://github.com/lxml/lxml/blob/master/CHANGES.txt) - [Commits](https://github.com/lxml/lxml/compare/lxml-4.5.1...lxml-4.6.2) Signed-off-by: dependabot[bot] --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 6092b299..fde7f1f8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -lxml==4.5.1 +lxml==4.6.2 requests==2.23.0 tqdm==4.46.1 absl-py==0.9.0 \ No newline at end of file From a00d4699fec12c01248e4de82462e948dbdc149e Mon Sep 17 00:00:00 2001 From: dataabc Date: Fri, 8 Jan 2021 18:52:27 +0800 Subject: [PATCH 304/363] =?UTF-8?q?perf:=20=E4=BC=98=E5=8C=96test?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ...f62165fa3ca1e85e0d398d385c377a068b76eb95765f7020ffffd3e.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/testdata/2f62165fa3ca1e85e0d398d385c377a068b76eb95765f7020ffffd3e.html b/tests/testdata/2f62165fa3ca1e85e0d398d385c377a068b76eb95765f7020ffffd3e.html index 1dfb4881..1101efb5 100644 --- a/tests/testdata/2f62165fa3ca1e85e0d398d385c377a068b76eb95765f7020ffffd3e.html +++ b/tests/testdata/2f62165fa3ca1e85e0d398d385c377a068b76eb95765f7020ffffd3e.html @@ -1 +1 @@ -Dear-迪丽热巴的微博
Dear-迪丽热巴的微博 加关注
 微博  相册 
生日动态  
赞[1499675] 转发[1000000] 评论[1000000] 收藏 06月03日 00:00 来自生日动态
炎炎夏日让每天的沐浴时光都变得尤其重要,精致的沙龙香相伴让沐浴也可以成为清新浪漫的享受!给大家@LUX力士 的沐浴小秘密分享,有力士植萃沐浴露,把沐浴变成“仪式感”!我的心选好物分享给你们啦 [笑而不语] LUX力士的微博视频  
赞[377578] 转发[1000000] 评论[1000000] 收藏 05月31日 10:59
#idoltube##周放vlog# 第二篇来啦!今天邀请大家走进生活,走进幸福的放放子一家~[喵喵]#幸福触手可及# Dear-迪丽热巴的微博视频  
赞[397970] 转发[1000000] 评论[1000000] 收藏 05月30日 19:02 来自国产剧集 · 视频社区
@法国娇韵诗 收到宠爱了~小娇的618#娇宠你有一套#,早晚护肤都靠它,超级喜欢这份宠爱!现在给全体爱丽丝们施法,希望你们都可以拥有这份让你变美的娇宠礼物哦~同款娇宠http://t.cn/A62cgDJp一起享用!  [组图共2张]
#微博剧场# 我为4A景区代言,酷飒周放的追剧邀请,你来吗? #4A景区触手可及#
@路易威登 PONT 9 手袋 陪你摩登一夏[嘻嘻]#LVPONT9#  [组图共3张]
#热巴手稿填色大赛#服装手稿填色游戏正式开启!图一出自迪迪子,图二出自放放子。迪迪子的面子就靠大家的后期填色了[微笑] 绿洲  [组图共2张]
图片 原图 
赞[733671] 转发[1000000] 评论[1000000] 收藏 05月27日 14:48 来自绿洲APP
转发了 护舒宝VM 的微博:还记得和宝宝陪着@Dear-迪丽热巴 走过的花路吗?谢谢阿丝们一直以来的陪伴[太开心][太开心]~为你甄选护舒宝天然纯棉卫生巾,给你透气亲肤的体验。现在上天猫超市购买,1套减25,第2套只要19.9。未来的花路,和宝宝一起用好物,守护热巴!#迪丽热巴[超话]#
图片 原图 赞[43521] 原文转发[1000000] 原文评论[13967]
转发理由:谢谢@护舒宝 和阿丝们的守护,每一刻都非常有意义。未来请继续指教啦~  
赞[418834] 转发[1000000] 评论[1000000] 收藏 05月26日 11:14
#idoltube##周放vlog# 放放子的第一支搞事业篇vlog已上线~约vlog的朋友们可以放下你们的号码牌了[可爱] #幸福触手可及# Dear-迪丽热巴的微博视频  
赞[450541] 转发[1000000] 评论[216934] 收藏 05月25日 20:53 来自影视剪辑 · 视频社区
下页 上页 首页  2/117页
TOP
彩版|触屏|语音
weibo.cn[06-19 00:47]
+Dear-迪丽热巴的微博
Dear-迪丽热巴的微博 加关注
 微博  相册 
生日动态  
赞[1499675] 转发[1000000] 评论[1000000] 收藏 2020-06-03 00:00 来自生日动态
炎炎夏日让每天的沐浴时光都变得尤其重要,精致的沙龙香相伴让沐浴也可以成为清新浪漫的享受!给大家@LUX力士 的沐浴小秘密分享,有力士植萃沐浴露,把沐浴变成“仪式感”!我的心选好物分享给你们啦 [笑而不语] LUX力士的微博视频  
赞[377578] 转发[1000000] 评论[1000000] 收藏 2020-05-31 10:59
#idoltube##周放vlog# 第二篇来啦!今天邀请大家走进生活,走进幸福的放放子一家~[喵喵]#幸福触手可及# Dear-迪丽热巴的微博视频  
赞[397970] 转发[1000000] 评论[1000000] 收藏 2020-05-30 19:02 来自国产剧集 · 视频社区
@法国娇韵诗 收到宠爱了~小娇的618#娇宠你有一套#,早晚护肤都靠它,超级喜欢这份宠爱!现在给全体爱丽丝们施法,希望你们都可以拥有这份让你变美的娇宠礼物哦~同款娇宠http://t.cn/A62cgDJp一起享用!  [组图共2张]
#微博剧场# 我为4A景区代言,酷飒周放的追剧邀请,你来吗? #4A景区触手可及#
@路易威登 PONT 9 手袋 陪你摩登一夏[嘻嘻]#LVPONT9#  [组图共3张]
#热巴手稿填色大赛#服装手稿填色游戏正式开启!图一出自迪迪子,图二出自放放子。迪迪子的面子就靠大家的后期填色了[微笑] 绿洲  [组图共2张]
图片 原图 
赞[733671] 转发[1000000] 评论[1000000] 收藏 2020-05-27 14:48 来自绿洲APP
转发了 护舒宝VM 的微博:还记得和宝宝陪着@Dear-迪丽热巴 走过的花路吗?谢谢阿丝们一直以来的陪伴[太开心][太开心]~为你甄选护舒宝天然纯棉卫生巾,给你透气亲肤的体验。现在上天猫超市购买,1套减25,第2套只要19.9。未来的花路,和宝宝一起用好物,守护热巴!#迪丽热巴[超话]#
图片 原图 赞[43521] 原文转发[1000000] 原文评论[13967]
转发理由:谢谢@护舒宝 和阿丝们的守护,每一刻都非常有意义。未来请继续指教啦~  
赞[418834] 转发[1000000] 评论[1000000] 收藏 2020-05-26 11:14
#idoltube##周放vlog# 放放子的第一支搞事业篇vlog已上线~约vlog的朋友们可以放下你们的号码牌了[可爱] #幸福触手可及# Dear-迪丽热巴的微博视频  
赞[450541] 转发[1000000] 评论[216934] 收藏 2020-05-25 20:53 来自影视剪辑 · 视频社区
下页 上页 首页  2/117页
TOP
彩版|触屏|语音
weibo.cn[06-19 00:47]
From 89b30e9db29fda77743d9f14c70db49e713e75a4 Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Wed, 20 Jan 2021 19:14:59 +0800 Subject: [PATCH 305/363] Update README.md --- README.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 530884d5..060a873a 100644 --- a/README.md +++ b/README.md @@ -8,12 +8,12 @@ 具体的写入文件类型如下: -- 写入**txt文件**(默认) -- 写入**csv文件**(默认) -- 写入**json文件**(可选) -- 写入**MySQL数据库**(可选) -- 写入**MongoDB数据库**(可选) -- 写入**SQLite数据库**(可选) +- **txt文件**(默认) +- **csv文件**(默认) +- **json文件**(可选) +- **MySQL数据库**(可选) +- **MongoDB数据库**(可选) +- **SQLite数据库**(可选) - 下载用户**原创**微博中的原始**图片**(可选) - 下载用户**转发**微博中的原始**图片**([免cookie版](https://github.com/dataabc/weibo-crawler)特有) - 下载用户**原创**微博中的**视频**(可选) From ab2b02bd5b1e539e5897559f324220877389f5b5 Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Wed, 20 Jan 2021 19:19:40 +0800 Subject: [PATCH 306/363] Update README.md --- README.md | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 060a873a..00ce520e 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ 本程序可以连续爬取**一个**或**多个**新浪微博用户(如[胡歌](https://weibo.cn/u/1223178222)、[迪丽热巴](https://weibo.cn/u/1669879400)、[郭碧婷](https://weibo.cn/u/1729370543))的数据,并将结果信息写入**文件**或**数据库**。写入信息几乎包括用户微博的所有数据,包括**用户信息**和**微博信息**两大类。因为内容太多,这里不再赘述,详细内容见[获取到的字段](#获取到的字段)。如果只需要用户信息,可以通过设置实现只爬取微博用户信息的功能。本程序需设置cookie来获取微博访问权限,后面会讲解[如何获取cookie](#如何获取cookie)。如果不想设置cookie,可以使用[免cookie版](https://github.com/dataabc/weibo-crawler),二者功能类似。 -具体的写入文件类型如下: +爬取结果可写入文件和数据库,具体的写入文件类型如下: - **txt文件**(默认) - **csv文件**(默认) @@ -14,12 +14,14 @@ - **MySQL数据库**(可选) - **MongoDB数据库**(可选) - **SQLite数据库**(可选) -- 下载用户**原创**微博中的原始**图片**(可选) -- 下载用户**转发**微博中的原始**图片**([免cookie版](https://github.com/dataabc/weibo-crawler)特有) -- 下载用户**原创**微博中的**视频**(可选) -- 下载用户**转发**微博中的**视频**([免cookie版](https://github.com/dataabc/weibo-crawler)特有) -- 下载用户**原创**微博**Live Photo**中的**视频**([免cookie版](https://github.com/dataabc/weibo-crawler)特有) -- 
下载用户**转发**微博**Live Photo**中的**视频**([免cookie版](https://github.com/dataabc/weibo-crawler)特有) + +同时支持下载微博中的图片和视频,具体的可下载文件如下: +- **原创**微博中的原始**图片**(可选) +- **转发**微博中的原始**图片**([免cookie版](https://github.com/dataabc/weibo-crawler)特有) +- **原创**微博中的**视频**(可选) +- **转发**微博中的**视频**([免cookie版](https://github.com/dataabc/weibo-crawler)特有) +- **原创**微博**Live Photo**中的**视频**([免cookie版](https://github.com/dataabc/weibo-crawler)特有) +- **转发**微博**Live Photo**中的**视频**([免cookie版](https://github.com/dataabc/weibo-crawler)特有) ## 内容列表 From ce65ca54f1f118038727a0d12db08369188ab0aa Mon Sep 17 00:00:00 2001 From: dataabc Date: Fri, 19 Feb 2021 19:34:52 +0800 Subject: [PATCH 307/363] =?UTF-8?q?fix:=20=E4=BF=AE=E5=A4=8D=E9=81=87?= =?UTF-8?q?=E5=88=B0=E6=97=A0=E9=99=90id=E7=BB=88=E6=AD=A2=E7=9A=84?= =?UTF-8?q?=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Issue #301 --- setup.py | 2 +- weibo_spider/spider.py | 41 ++++++++++++++++++++++++----------------- 2 files changed, 25 insertions(+), 18 deletions(-) diff --git a/setup.py b/setup.py index 014f600a..2f343744 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setuptools.setup( name='weibo-spider', - version='0.1.8', + version='0.1.9', author='Chen Lei', author_email='chillychen1991@gmail.com', description='新浪微博爬虫,用python爬取新浪微博数据。', diff --git a/weibo_spider/spider.py b/weibo_spider/spider.py index dd7abca1..d5a17dae 100644 --- a/weibo_spider/spider.py +++ b/weibo_spider/spider.py @@ -286,6 +286,29 @@ def initialize_info(self, user_config): VideoDownloader(self._get_filepath('video'), self.file_download_timeout)) + def get_one_user(self, user_config): + """获取一个用户的微博""" + try: + self.get_user_info(user_config['user_uri']) + logger.info(self.user) + logger.info('*' * 100) + + self.initialize_info(user_config) + self.write_user(self.user) + logger.info('*' * 100) + + for weibos in self.get_weibo_info(): + self.write_weibo(weibos) + self.got_num += len(weibos) + if not self.filter: + 
logger.info(u'共爬取' + str(self.got_num) + u'条微博') + else: + logger.info(u'共爬取' + str(self.got_num) + u'条原创微博') + logger.info(u'信息抓取完毕') + logger.info('*' * 100) + except Exception as e: + logger.exception(e) + def start(self): """运行爬虫""" try: @@ -302,23 +325,7 @@ def start(self): user_count1 = user_count random_users = random.randint(*self.random_wait_pages) user_count += 1 - self.get_user_info(user_config['user_uri']) - logger.info(self.user) - logger.info('*' * 100) - - self.initialize_info(user_config) - self.write_user(self.user) - logger.info('*' * 100) - - for weibos in self.get_weibo_info(): - self.write_weibo(weibos) - self.got_num += len(weibos) - if not self.filter: - logger.info(u'共爬取' + str(self.got_num) + u'条微博') - else: - logger.info(u'共爬取' + str(self.got_num) + u'条原创微博') - logger.info(u'信息抓取完毕') - logger.info('*' * 100) + self.get_one_user(user_config) except Exception as e: logger.exception(e) From 14825714f2573acce64937d90e51c54962d262fc Mon Sep 17 00:00:00 2001 From: dataabc Date: Fri, 19 Feb 2021 19:42:34 +0800 Subject: [PATCH 308/363] =?UTF-8?q?fix:=20=E4=BF=AE=E5=A4=8D=E5=9B=A0?= =?UTF-8?q?=E8=AE=BF=E9=97=AEmysql=E5=87=BA=E9=94=99=E8=80=8C=E7=BB=88?= =?UTF-8?q?=E6=AD=A2=E7=9A=84=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Issue #301 --- weibo_spider/writer/mysql_writer.py | 104 +++++++++++++++------------- 1 file changed, 55 insertions(+), 49 deletions(-) diff --git a/weibo_spider/writer/mysql_writer.py b/weibo_spider/writer/mysql_writer.py index 83831745..7118c083 100644 --- a/weibo_spider/writer/mysql_writer.py +++ b/weibo_spider/writer/mysql_writer.py @@ -81,56 +81,62 @@ def _mysql_insert(self, table, data_list): def write_weibo(self, weibos): """将爬取的微博信息写入MySQL数据库""" # 创建'weibo'表 - create_table = """ - CREATE TABLE IF NOT EXISTS weibo ( - id varchar(10) NOT NULL, - user_id varchar(12), - content varchar(5000), - article_url varchar(200), - original_pictures varchar(3000), - 
retweet_pictures varchar(3000), - original BOOLEAN NOT NULL DEFAULT 1, - video_url varchar(300), - publish_place varchar(100), - publish_time DATETIME NOT NULL, - publish_tool varchar(30), - up_num INT NOT NULL, - retweet_num INT NOT NULL, - comment_num INT NOT NULL, - PRIMARY KEY (id) - ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4""" - self._mysql_create_table(create_table) - # 在'weibo'表中插入或更新微博数据 - weibo_list = [] - info_list = copy.deepcopy(weibos) - for weibo in info_list: - weibo.user_id = self.user.id - weibo_list.append(weibo.__dict__) - self._mysql_insert('weibo', weibo_list) - logger.info(u'%d条微博写入MySQL数据库完毕', len(weibos)) + try: + create_table = """ + CREATE TABLE IF NOT EXISTS weibo ( + id varchar(10) NOT NULL, + user_id varchar(12), + content varchar(5000), + article_url varchar(200), + original_pictures varchar(3000), + retweet_pictures varchar(3000), + original BOOLEAN NOT NULL DEFAULT 1, + video_url varchar(300), + publish_place varchar(100), + publish_time DATETIME NOT NULL, + publish_tool varchar(30), + up_num INT NOT NULL, + retweet_num INT NOT NULL, + comment_num INT NOT NULL, + PRIMARY KEY (id) + ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4""" + self._mysql_create_table(create_table) + # 在'weibo'表中插入或更新微博数据 + weibo_list = [] + info_list = copy.deepcopy(weibos) + for weibo in info_list: + weibo.user_id = self.user.id + weibo_list.append(weibo.__dict__) + self._mysql_insert('weibo', weibo_list) + logger.info(u'%d条微博写入MySQL数据库完毕', len(weibos)) + except Exception as e: + logger.exception(e) def write_user(self, user): """将爬取的用户信息写入MySQL数据库""" - self.user = user + try: + self.user = user - # 创建'user'表 - create_table = """ - CREATE TABLE IF NOT EXISTS user ( - id varchar(20) NOT NULL, - nickname varchar(30), - gender varchar(10), - location varchar(200), - birthday varchar(40), - description varchar(400), - verified_reason varchar(140), - talent varchar(200), - education varchar(200), - work varchar(200), - weibo_num INT, - following INT, - followers INT, - 
PRIMARY KEY (id) - ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4""" - self._mysql_create_table(create_table) - self._mysql_insert('user', [user.__dict__]) - logger.info(u'%s信息写入MySQL数据库完毕', user.nickname) + # 创建'user'表 + create_table = """ + CREATE TABLE IF NOT EXISTS user ( + id varchar(20) NOT NULL, + nickname varchar(30), + gender varchar(10), + location varchar(200), + birthday varchar(40), + description varchar(400), + verified_reason varchar(140), + talent varchar(200), + education varchar(200), + work varchar(200), + weibo_num INT, + following INT, + followers INT, + PRIMARY KEY (id) + ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4""" + self._mysql_create_table(create_table) + self._mysql_insert('user', [user.__dict__]) + logger.info(u'%s信息写入MySQL数据库完毕', user.nickname) + except Exception as e: + logger.exception(e) From 350c96b7e8147b018b6aeac4c2d0de9f033125e9 Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Mon, 8 Mar 2021 19:20:30 +0800 Subject: [PATCH 309/363] Create academic.md --- docs/academic.md | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 docs/academic.md diff --git a/docs/academic.md b/docs/academic.md new file mode 100644 index 00000000..5da6b3f2 --- /dev/null +++ b/docs/academic.md @@ -0,0 +1,4 @@ +## 学术研究 + +本项目通过获取微博数据,为写论文、做研究等非商业项目提供所需数据。下面是一些在论文或研究等方面使用过本程序的项目。在一些涉及隐私的描述上,已与研究者做了沟通,在下面的描述中只介绍研究者 +允许展示的部分。如果部分信息研究者之前同意展示并且已经写在了本文档中,现在又不想展示了,可以通过邮件(chillychen1991@gmail.com)或issue的方式告诉我,我会删除相关信息。同时,使用本项目写论文或做其它学术研究的朋友,如果想把自己的研究成果展示在下面,也可以通过邮件或issue的方式告诉我。 From 84411d15ebf6bfc563653c18576650b42c559a3e Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Mon, 8 Mar 2021 19:43:11 +0800 Subject: [PATCH 310/363] Update README.md --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index 00ce520e..f247931a 100644 --- a/README.md +++ b/README.md @@ -42,6 +42,7 @@ - [如何获取cookie](#如何获取cookie) - [如何获取user_id](#如何获取user_id) - [常见问题](#常见问题) + - [学术研究](#学术研究) - [相关项目](#相关项目) - [贡献](#贡献) - [贡献者](#贡献者) @@ -230,7 +231,9 @@ $ python3 
-m weibo_spider --u="1669879400,1223178222" ## 常见问题 如果运行程序的过程中出现错误,可以查看[常见问题](https://github.com/dataabc/weiboSpider/blob/master/docs/FAQ.md)页面,里面包含了最常见的问题及解决方法。如果出现的错误不在常见问题里,您可以通过[发issue](https://github.com/dataabc/weiboSpider/issues/new/choose)寻求帮助,我们会很乐意为您解答。 +## 学术研究 +本项目通过获取微博数据,为写论文、做研究等非商业项目提供所需数据。[学术研究文档](https://github.com/dataabc/weiboSpider/blob/master/docs/academic.md)是一些在论文或研究等方面使用过本程序的项目,这些项目展示已征得所有者同意。在一些涉及隐私的描述上,已与所有者做了沟通,描述中只介绍所有者允许展示的部分。如果部分信息所有者之前同意展示并且已经写在了文档中,现在又不想展示了,可以通过邮件(chillychen1991@gmail.com)或issue的方式告诉我,我会删除相关信息。同时,也欢迎使用本项目写论文或做其它学术研究的朋友,将自己的研究成果展示在[学术研究文档](https://github.com/dataabc/weiboSpider/blob/master/docs/academic.md)里,这完全是自愿的。 ## 相关项目 - [weibo-crawler](https://github.com/dataabc/weibo-crawler) - 功能和本项目完全一样,可以不添加cookie,获取的微博属性更多; From 4ee03323931aea23448bf8c39130b3905bdaf50c Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Sat, 13 Mar 2021 20:57:28 +0800 Subject: [PATCH 311/363] Update academic.md --- docs/academic.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/academic.md b/docs/academic.md index 5da6b3f2..ec2ced91 100644 --- a/docs/academic.md +++ b/docs/academic.md @@ -2,3 +2,7 @@ 本项目通过获取微博数据,为写论文、做研究等非商业项目提供所需数据。下面是一些在论文或研究等方面使用过本程序的项目。在一些涉及隐私的描述上,已与研究者做了沟通,在下面的描述中只介绍研究者 允许展示的部分。如果部分信息研究者之前同意展示并且已经写在了本文档中,现在又不想展示了,可以通过邮件(chillychen1991@gmail.com)或issue的方式告诉我,我会删除相关信息。同时,使用本项目写论文或做其它学术研究的朋友,如果想把自己的研究成果展示在下面,也可以通过邮件或issue的方式告诉我。 + +*** + +英国伦敦国王学院[Mak-LokGay](https://github.com/Mak-LokGay)的[毕业论文](https://github.com/Mak-LokGay/KCL_Dissertation) From 135d698211df4e1a24c6b4064f071f714a75abe1 Mon Sep 17 00:00:00 2001 From: dataabc Date: Wed, 17 Mar 2021 19:27:21 +0800 Subject: [PATCH 312/363] =?UTF-8?q?fix:=20=E4=BF=AE=E5=A4=8D=E8=BF=9E?= =?UTF-8?q?=E7=BB=AD=E8=8E=B7=E5=8F=96=E7=A9=BA=E9=A1=B5=E9=9D=A2=E7=9A=84?= =?UTF-8?q?=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Issue #313 --- setup.py | 2 +- weibo_spider/parser/page_parser.py | 14 ++++++++++++-- 2 
files changed, 13 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index 2f343744..b70e4331 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setuptools.setup( name='weibo-spider', - version='0.1.9', + version='0.2.0', author='Chen Lei', author_email='chillychen1991@gmail.com', description='新浪微博爬虫,用python爬取新浪微博数据。', diff --git a/weibo_spider/parser/page_parser.py b/weibo_spider/parser/page_parser.py index 6bcc32a6..1e2fd9d2 100644 --- a/weibo_spider/parser/page_parser.py +++ b/weibo_spider/parser/page_parser.py @@ -35,7 +35,17 @@ def __init__(self, cookie, user_config, page, filter): endtime = ''.join(end_date) self.url = 'https://weibo.cn/%s/profile?starttime=%s&endtime=%s&advancedfilter=1&page=%d' % ( self.user_uri, starttime, endtime, page) - self.selector = handle_html(self.cookie, self.url) + self.selector = '' + self.to_continue = True + is_exist = '' + for i in range(3): + self.selector = handle_html(self.cookie, self.url) + info = self.selector.xpath("//div[@class='c']") + is_exist = info[0].xpath("div/span[@class='ctt']") + if is_exist: + break + if not is_exist: + self.to_continue = False self.filter = filter def get_one_page(self, weibo_id_list): @@ -63,7 +73,7 @@ def get_one_page(self, weibo_id_list): logger.info('-' * 100) weibos.append(weibo) weibo_id_list.append(weibo.id) - return weibos, weibo_id_list, True + return weibos, weibo_id_list, self.to_continue except Exception as e: logger.exception(e) From 885069e23aaf46433d08c91cdb45f61248104917 Mon Sep 17 00:00:00 2001 From: dataabc Date: Fri, 19 Mar 2021 19:26:37 +0800 Subject: [PATCH 313/363] =?UTF-8?q?fix:=20=E4=BF=AE=E5=A4=8D=E9=83=A8?= =?UTF-8?q?=E5=88=86=E5=BE=AE=E5=8D=9A=E6=97=A0=E6=B3=95=E5=86=99=E5=85=A5?= =?UTF-8?q?mysql=E7=9A=84bug?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Issue #317 --- setup.py | 2 +- weibo_spider/parser/util.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 
b70e4331..6b37bc28 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setuptools.setup( name='weibo-spider', - version='0.2.0', + version='0.2.1', author='Chen Lei', author_email='chillychen1991@gmail.com', description='新浪微博爬虫,用python爬取新浪微博数据。', diff --git a/weibo_spider/parser/util.py b/weibo_spider/parser/util.py index 52e39712..1a5f62cd 100644 --- a/weibo_spider/parser/util.py +++ b/weibo_spider/parser/util.py @@ -53,6 +53,7 @@ def handle_garbled(info): return info except Exception as e: logger.exception(e) + return u'无' def bid2mid(bid): From 448e40844766841e2fbc1b1d9748ab6e9f2e0e98 Mon Sep 17 00:00:00 2001 From: dataabc Date: Fri, 26 Mar 2021 21:39:29 +0800 Subject: [PATCH 314/363] =?UTF-8?q?fix:=20=E4=BF=AE=E5=A4=8D=E4=BD=BF?= =?UTF-8?q?=E7=94=A8kafka=E6=97=B6=E9=83=A8=E5=88=86=E5=86=85=E5=AE=B9?= =?UTF-8?q?=E4=B8=BA=E7=A9=BA=E7=9A=84bug?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- setup.py | 2 +- weibo_spider/writer/kafka_writer.py | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 6b37bc28..1b88a456 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setuptools.setup( name='weibo-spider', - version='0.2.1', + version='0.2.2', author='Chen Lei', author_email='chillychen1991@gmail.com', description='新浪微博爬虫,用python爬取新浪微博数据。', diff --git a/weibo_spider/writer/kafka_writer.py b/weibo_spider/writer/kafka_writer.py index 64ef4749..247fd3a2 100644 --- a/weibo_spider/writer/kafka_writer.py +++ b/weibo_spider/writer/kafka_writer.py @@ -27,10 +27,13 @@ def __init__(self, kafka_config): def write_weibo(self, weibo): for w in weibo: + w.user_id = self.user.id for topic in self.weibo_topics: self.producer.send(topic, value=w.__dict__) def write_user(self, user): + self.user = user + for topic in self.user_topics: self.producer.send(topic, value=user.__dict__) From 325abbc12601ca530cd04e11a89970e76a6dad92 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" 
<49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 26 Mar 2021 13:40:11 +0000 Subject: [PATCH 315/363] build(deps): bump lxml from 4.6.2 to 4.6.3 Bumps [lxml](https://github.com/lxml/lxml) from 4.6.2 to 4.6.3. - [Release notes](https://github.com/lxml/lxml/releases) - [Changelog](https://github.com/lxml/lxml/blob/master/CHANGES.txt) - [Commits](https://github.com/lxml/lxml/compare/lxml-4.6.2...lxml-4.6.3) Signed-off-by: dependabot[bot] --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index fde7f1f8..59c9e88f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -lxml==4.6.2 +lxml==4.6.3 requests==2.23.0 tqdm==4.46.1 absl-py==0.9.0 \ No newline at end of file From c12830dea3917510c1614fda58cfa3ef34ba1dd1 Mon Sep 17 00:00:00 2001 From: schaepher Date: Sun, 28 Mar 2021 17:03:45 +0800 Subject: [PATCH 316/363] =?UTF-8?q?fix(test=20data):=20=E4=BF=AE=E5=A4=8D?= =?UTF-8?q?=20Windows=20=E7=B3=BB=E7=BB=9F=E4=B8=8B=E5=86=99=E5=85=A5=20Te?= =?UTF-8?q?st=20Data=20=E6=96=87=E4=BB=B6=E6=8A=A5=E9=94=99=E7=9A=84?= =?UTF-8?q?=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Windows 在配置 GENERATE_TEST_DATA 为 True 时执行程序会报错: 'gbk' codec can't encode character 'xxxx' in position xxxxxx: illegal multibyte sequence 原因是在 Windows 系统上,程序会将内容从 utf-8 编码转换为 gbk 编码,再写入文件。编码转换过程出现错误。 --- weibo_spider/parser/util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/weibo_spider/parser/util.py b/weibo_spider/parser/util.py index 1a5f62cd..02b4d33e 100644 --- a/weibo_spider/parser/util.py +++ b/weibo_spider/parser/util.py @@ -29,7 +29,7 @@ def handle_html(cookie, url): import os resp_file = os.path.join(TEST_DATA_DIR, '%s.html' % hash_url(url)) - with io.open(resp_file, 'w') as f: + with io.open(resp_file, 'w', encoding='utf-8') as f: f.write(resp.text) with io.open(os.path.join(TEST_DATA_DIR, URL_MAP_FILE), 'r+') as f: From 
b2573315f57326c1dd334f7d4f52b2df17639d26 Mon Sep 17 00:00:00 2001 From: schaepher Date: Sun, 28 Mar 2021 17:17:56 +0800 Subject: [PATCH 317/363] =?UTF-8?q?docs(faq):=20=E8=8E=B7=E5=8F=96?= =?UTF-8?q?=E8=87=AA=E5=B7=B1=E7=9A=84=E5=BE=AE=E5=8D=9A=E4=B8=8D=E9=9C=80?= =?UTF-8?q?=E8=A6=81=E4=BF=AE=E6=94=B9=20info=5Fparser.py?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 使用 https://weibo.cn/%s/info 就可以获取。 如果使用 https://weibo.cn/%s/profile 会进入自己发送和转发的微博列表,程序无法获取到数据,导致出现以下错误: list index out of range ..... 'NoneType' object has no attribute 'nickname' --- docs/FAQ.md | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/docs/FAQ.md b/docs/FAQ.md index bdefb3d5..107872dd 100644 --- a/docs/FAQ.md +++ b/docs/FAQ.md @@ -28,11 +28,7 @@ python3 -m weibo_spider 请使用[weibo-follow](https://github.com/dataabc/weibo-follow)。该程序可以利用一个user_id,获取该user_id微博用户关注人的user_id,一个user_id最多可以获得200个user_id,并写入user_id_list.txt文件。程序支持读文件,利用这200个user_id,可以获得最多200X200=40000个user_id。再利用这40000个user_id可以得到40000X200=8000000个user_id,如此反复,以此类推,可以获得大量user_id。本项目也支持读文件,将上述程序的结果文件user_id_list.txt路径赋值给本项目config.json的user_id_list参数,就可以获得这些user_id用户所发布的大量微博。 ### 7.如何获取自己的微博? 
-修改info_parser.py和page_parser.py中__init__方法,将前者的self.url修改为: -``` - self.url = "https://weibo.cn/%s/profile" % (user_id) -``` -后者的self.url修改为: +修改page_parser.py中__init__方法,将self.url修改为: ``` self.url = "https://weibo.cn/%s/profile?page=%d" % (user_uri, page) ``` From ffcb452a028b91780da0d86eb446651166d8fc56 Mon Sep 17 00:00:00 2001 From: schaepher Date: Sun, 28 Mar 2021 17:31:23 +0800 Subject: [PATCH 318/363] =?UTF-8?q?chore(gitignore):=20=E5=BF=BD=E7=95=A5?= =?UTF-8?q?=E4=BD=BF=E7=94=A8=20SQLite=20=E6=97=B6=E7=94=9F=E6=88=90?= =?UTF-8?q?=E7=9A=84=20weibo.db?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 默认配置文件将 SQLite 数据库文件名称配置为 weibo.db。应该忽略程序根据默认配置生成的文件。 --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 77108998..d4593149 100644 --- a/.gitignore +++ b/.gitignore @@ -10,6 +10,7 @@ dist/ config.json weibo/ +weibo.db *.log .idea From 3d7eca5564698b224dc432bd03d38b0218e790ca Mon Sep 17 00:00:00 2001 From: schaepher Date: Sun, 28 Mar 2021 17:36:59 +0800 Subject: [PATCH 319/363] =?UTF-8?q?feat(downloader):=20=E6=B7=BB=E5=8A=A0?= =?UTF-8?q?=E4=B8=8B=E8=BD=BD=E5=8E=9F=E5=BE=AE=E5=8D=9A=E5=9B=BE=E7=89=87?= =?UTF-8?q?=E7=9A=84=E5=8A=9F=E8=83=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- weibo_spider/downloader/__init__.py | 5 +++-- weibo_spider/downloader/img_downloader.py | 2 +- weibo_spider/downloader/origin_picture_downloader.py | 9 +++++++++ weibo_spider/downloader/retweet_picture_downloader.py | 9 +++++++++ weibo_spider/spider.py | 10 +++++++--- 5 files changed, 29 insertions(+), 6 deletions(-) create mode 100644 weibo_spider/downloader/origin_picture_downloader.py create mode 100644 weibo_spider/downloader/retweet_picture_downloader.py diff --git a/weibo_spider/downloader/__init__.py b/weibo_spider/downloader/__init__.py index 9d8c4f0d..d573f9f6 100644 --- a/weibo_spider/downloader/__init__.py +++ 
b/weibo_spider/downloader/__init__.py @@ -1,4 +1,5 @@ -from .img_downloader import ImgDownloader +from .origin_picture_downloader import OriginPictureDownloader +from .retweet_picture_downloader import RetweetPictureDownloader from .video_downloader import VideoDownloader -__all__ = [ImgDownloader, VideoDownloader] +__all__ = [OriginPictureDownloader, RetweetPictureDownloader, VideoDownloader] diff --git a/weibo_spider/downloader/img_downloader.py b/weibo_spider/downloader/img_downloader.py index 7655425c..95ccad77 100644 --- a/weibo_spider/downloader/img_downloader.py +++ b/weibo_spider/downloader/img_downloader.py @@ -7,7 +7,7 @@ class ImgDownloader(Downloader): def __init__(self, file_dir, file_download_timeout): super().__init__(file_dir, file_download_timeout) self.describe = u'图片' - self.key = 'original_pictures' + self.key = '' def handle_download(self, urls, w): """处理下载相关操作""" diff --git a/weibo_spider/downloader/origin_picture_downloader.py b/weibo_spider/downloader/origin_picture_downloader.py new file mode 100644 index 00000000..ae225f59 --- /dev/null +++ b/weibo_spider/downloader/origin_picture_downloader.py @@ -0,0 +1,9 @@ +import os + +from .img_downloader import ImgDownloader + + +class OriginPictureDownloader(ImgDownloader): + def __init__(self, file_dir, file_download_timeout): + super().__init__(file_dir, file_download_timeout) + self.key = 'original_pictures' diff --git a/weibo_spider/downloader/retweet_picture_downloader.py b/weibo_spider/downloader/retweet_picture_downloader.py new file mode 100644 index 00000000..f4f6b463 --- /dev/null +++ b/weibo_spider/downloader/retweet_picture_downloader.py @@ -0,0 +1,9 @@ +import os + +from .img_downloader import ImgDownloader + + +class RetweetPictureDownloader(ImgDownloader): + def __init__(self, file_dir, file_download_timeout): + super().__init__(file_dir, file_download_timeout) + self.key = 'retweet_pictures' diff --git a/weibo_spider/spider.py b/weibo_spider/spider.py index d5a17dae..365e169d 100644 
--- a/weibo_spider/spider.py +++ b/weibo_spider/spider.py @@ -274,11 +274,15 @@ def initialize_info(self, user_config): self.downloaders = [] if self.pic_download == 1: - from .downloader import ImgDownloader + from .downloader import OriginPictureDownloader, RetweetPictureDownloader self.downloaders.append( - ImgDownloader(self._get_filepath('img'), - self.file_download_timeout)) + OriginPictureDownloader(self._get_filepath('img'), + self.file_download_timeout)) + + self.downloaders.append( + RetweetPictureDownloader(self._get_filepath('img'), + self.file_download_timeout)) if self.video_download == 1: from .downloader import VideoDownloader From 358da5a65de8f589a0a7c12ac756762870ace24c Mon Sep 17 00:00:00 2001 From: schaepher Date: Sun, 28 Mar 2021 21:28:33 +0800 Subject: [PATCH 320/363] =?UTF-8?q?fix(page=5Fparser):=20=E4=BF=AE?= =?UTF-8?q?=E5=A4=8D=E5=9C=A8=E7=88=AC=E5=8F=96=E5=85=A8=E9=83=A8=E5=BE=AE?= =?UTF-8?q?=E5=8D=9A=E7=9A=84=E6=97=B6=E5=80=99=E5=BF=BD=E7=95=A5=E8=A7=86?= =?UTF-8?q?=E9=A2=91=E7=9A=84=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- weibo_spider/parser/page_parser.py | 64 +++++++++++++++--------------- 1 file changed, 31 insertions(+), 33 deletions(-) diff --git a/weibo_spider/parser/page_parser.py b/weibo_spider/parser/page_parser.py index 1e2fd9d2..b6f87007 100644 --- a/weibo_spider/parser/page_parser.py +++ b/weibo_spider/parser/page_parser.py @@ -266,39 +266,38 @@ def get_picture_urls(self, info, is_original): except Exception as e: logger.exception(e) - def get_video_url(self, info, is_original): + def get_video_url(self, info): """获取微博视频url""" try: video_url = u'无' - if is_original: - div_first = info.xpath('div')[0] - a_list = div_first.xpath('.//a') - video_link = u'无' - for a in a_list: - if 'm.weibo.cn/s/video/show?object_id=' in a.xpath( - '@href')[0]: - video_link = a.xpath('@href')[0] - break - if video_link != u'无': - video_link = video_link.replace( - 
'm.weibo.cn/s/video/show', 'm.weibo.cn/s/video/object') - try: - user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36' - headers = { - 'User_Agent': user_agent, - 'Cookie': self.cookie - } - wb_info = requests.get(video_link, - headers=headers).json() - video_url = wb_info['data']['object']['stream'].get( - 'hd_url') - if not video_url: - video_url = wb_info['data']['object']['stream'][ - 'url'] - if not video_url: # 说明该视频为直播 - video_url = u'无' - except json.decoder.JSONDecodeError: - logger.warning(u'当前账号没有浏览该视频的权限') + div_first = info.xpath('div')[0] + a_list = div_first.xpath('.//a') + video_link = u'无' + for a in a_list: + if 'm.weibo.cn/s/video/show?object_id=' in a.xpath( + '@href')[0]: + video_link = a.xpath('@href')[0] + break + if video_link != u'无': + video_link = video_link.replace( + 'm.weibo.cn/s/video/show', 'm.weibo.cn/s/video/object') + try: + user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36' + headers = { + 'User_Agent': user_agent, + 'Cookie': self.cookie + } + wb_info = requests.get(video_link, + headers=headers).json() + video_url = wb_info['data']['object']['stream'].get( + 'hd_url') + if not video_url: + video_url = wb_info['data']['object']['stream'][ + 'url'] + if not video_url: # 说明该视频为直播 + video_url = u'无' + except json.decoder.JSONDecodeError: + logger.warning(u'当前账号没有浏览该视频的权限') return video_url except Exception as e: logger.exception(e) @@ -317,6 +316,7 @@ def get_one_weibo(self, info): try: weibo = Weibo() is_original = self.is_original(info) + weibo.original = is_original # 是否原创微博 if (not self.filter) or is_original: weibo.id = info.xpath('@id')[0][2:] weibo.content = self.get_weibo_content(info, @@ -328,9 +328,7 @@ def get_one_weibo(self, info): if not self.filter: weibo.retweet_pictures = picture_urls[ 'retweet_pictures'] # 转发图片url - weibo.original = is_original # 是否原创微博 - 
weibo.video_url = self.get_video_url(info, - is_original) # 微博视频url + weibo.video_url = self.get_video_url(info) # 微博视频url weibo.publish_place = self.get_publish_place(info) # 微博发布位置 weibo.publish_time = self.get_publish_time(info) # 微博发布时间 weibo.publish_tool = self.get_publish_tool(info) # 微博发布工具 From c8047f8b2f301c2bda8a3be52f0f7efab1485e2e Mon Sep 17 00:00:00 2001 From: schaepher Date: Sun, 28 Mar 2021 21:49:42 +0800 Subject: [PATCH 321/363] =?UTF-8?q?fix(downloader):=20=E5=88=A0=E9=99=A4?= =?UTF-8?q?=E4=B8=8B=E8=BD=BD=E6=96=87=E4=BB=B6=E5=90=8D=E4=B8=AD=E5=A4=9A?= =?UTF-8?q?=E4=BD=99=E7=9A=84=E7=A9=BA=E6=A0=BC?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit "2021-03-28" 共 10 个字符,取 0~9 就行 --- weibo_spider/downloader/img_downloader.py | 2 +- weibo_spider/downloader/video_downloader.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/weibo_spider/downloader/img_downloader.py b/weibo_spider/downloader/img_downloader.py index 95ccad77..adea0e09 100644 --- a/weibo_spider/downloader/img_downloader.py +++ b/weibo_spider/downloader/img_downloader.py @@ -11,7 +11,7 @@ def __init__(self, file_dir, file_download_timeout): def handle_download(self, urls, w): """处理下载相关操作""" - file_prefix = w.publish_time[:11].replace('-', '') + '_' + w.id + file_prefix = w.publish_time[:10].replace('-', '') + '_' + w.id if ',' in urls: url_list = urls.split(',') for i, url in enumerate(url_list): diff --git a/weibo_spider/downloader/video_downloader.py b/weibo_spider/downloader/video_downloader.py index 3e9953de..ba6a8f39 100644 --- a/weibo_spider/downloader/video_downloader.py +++ b/weibo_spider/downloader/video_downloader.py @@ -11,7 +11,7 @@ def __init__(self, file_dir, file_download_timeout): def handle_download(self, urls, w): """处理下载相关操作""" - file_prefix = w.publish_time[:11].replace('-', '') + '_' + w.id + file_prefix = w.publish_time[:10].replace('-', '') + '_' + w.id file_suffix = '.mp4' file_name = file_prefix + 
file_suffix file_path = self.file_dir + os.sep + file_name From 7cb0b2f8f862e5fa75030beba954c66cb666afb9 Mon Sep 17 00:00:00 2001 From: schaepher Date: Sun, 28 Mar 2021 22:02:06 +0800 Subject: [PATCH 322/363] =?UTF-8?q?feat(page=5Fparser):=20=E6=B7=BB?= =?UTF-8?q?=E5=8A=A0=E8=8E=B7=E5=8F=96=E9=95=BF=E5=BE=AE=E5=8D=9A=E8=A7=86?= =?UTF-8?q?=E9=A2=91=E9=93=BE=E6=8E=A5=E7=9A=84=E5=8A=9F=E8=83=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- weibo_spider/parser/comment_parser.py | 18 +++++++++ weibo_spider/parser/page_parser.py | 53 +++++++++++---------------- weibo_spider/parser/util.py | 29 ++++++++++++++- 3 files changed, 67 insertions(+), 33 deletions(-) diff --git a/weibo_spider/parser/comment_parser.py b/weibo_spider/parser/comment_parser.py index 8e10fa3c..acc4cb2e 100644 --- a/weibo_spider/parser/comment_parser.py +++ b/weibo_spider/parser/comment_parser.py @@ -1,5 +1,6 @@ import logging import random +import requests from time import sleep from .parser import Parser @@ -39,3 +40,20 @@ def get_long_retweet(self): return weibo_content except Exception as e: logger.exception(e) + + def get_video_page_url(self): + """获取微博视频页面的链接""" + video_url = '' + try: + self.selector = handle_html(self.cookie, self.url) + if self.selector is not None: + links = self.selector.xpath("body/div[@class='c' and @id][1]/div/span/a") + for a in links: + if 'm.weibo.cn/s/video/show?object_id=' in a.xpath( + '@href')[0]: + video_url = a.xpath('@href')[0] + break + except Exception: + logger.exception(u'网络出错') + + return video_url diff --git a/weibo_spider/parser/page_parser.py b/weibo_spider/parser/page_parser.py index b6f87007..dd31bbe0 100644 --- a/weibo_spider/parser/page_parser.py +++ b/weibo_spider/parser/page_parser.py @@ -11,7 +11,7 @@ from .comment_parser import CommentParser from .mblog_picAll_parser import MblogPicAllParser from .parser import Parser -from .util import handle_garbled, handle_html +from .util import 
handle_garbled, handle_html, to_video_download_url logger = logging.getLogger('spider.page_parser') @@ -268,40 +268,29 @@ def get_picture_urls(self, info, is_original): def get_video_url(self, info): """获取微博视频url""" + video_url = u'无' + + weibo_id = info.xpath('@id')[0][2:] try: - video_url = u'无' - div_first = info.xpath('div')[0] - a_list = div_first.xpath('.//a') - video_link = u'无' - for a in a_list: - if 'm.weibo.cn/s/video/show?object_id=' in a.xpath( - '@href')[0]: - video_link = a.xpath('@href')[0] - break - if video_link != u'无': - video_link = video_link.replace( - 'm.weibo.cn/s/video/show', 'm.weibo.cn/s/video/object') - try: - user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36' - headers = { - 'User_Agent': user_agent, - 'Cookie': self.cookie - } - wb_info = requests.get(video_link, - headers=headers).json() - video_url = wb_info['data']['object']['stream'].get( - 'hd_url') - if not video_url: - video_url = wb_info['data']['object']['stream'][ - 'url'] - if not video_url: # 说明该视频为直播 - video_url = u'无' - except json.decoder.JSONDecodeError: - logger.warning(u'当前账号没有浏览该视频的权限') - return video_url + video_page_url = '' + a_text = info.xpath('div[1]/span[@class="ctt"]/a/text()') + if u'全文' in a_text: + video_page_url = CommentParser(self.cookie, + weibo_id).get_video_page_url() + else: + a_list = info.xpath('div[1]/span[@class="ctt"]/a') + for a in a_list: + if 'm.weibo.cn/s/video/show?object_id=' in a.xpath( + '@href')[0]: + video_page_url = a.xpath('@href')[0] + break + + if video_page_url != '': + video_url = to_video_download_url(self.cookie, video_page_url) except Exception as e: logger.exception(e) - return u'无' + + return video_url def is_pinned_weibo(self, info): """判断微博是否为置顶微博""" diff --git a/weibo_spider/parser/util.py b/weibo_spider/parser/util.py index 02b4d33e..a763e96a 100644 --- a/weibo_spider/parser/util.py +++ b/weibo_spider/parser/util.py @@ -1,6 +1,7 @@ import 
hashlib import logging import sys +import json import requests from lxml import etree @@ -25,7 +26,6 @@ def handle_html(cookie, url): if GENERATE_TEST_DATA: import io - import json import os resp_file = os.path.join(TEST_DATA_DIR, '%s.html' % hash_url(url)) @@ -81,3 +81,30 @@ def bid2mid(bid): strnum = '0' + strnum mid += strnum return mid + + +def to_video_download_url(cookie, video_page_url): + if video_page_url == '': + return '' + + video_object_url = video_page_url.replace( + 'm.weibo.cn/s/video/show', 'm.weibo.cn/s/video/object') + try: + user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36' + headers = { + 'User_Agent': user_agent, + 'Cookie': cookie + } + wb_info = requests.get(video_object_url, + headers=headers).json() + video_url = wb_info['data']['object']['stream'].get( + 'hd_url') + if not video_url: + video_url = wb_info['data']['object']['stream'][ + 'url'] + if not video_url: # 说明该视频为直播 + video_url = '' + except json.decoder.JSONDecodeError: + logger.warning(u'当前账号没有浏览该视频的权限') + + return video_url From 85333bdd05c7a66d220dc2fd7d1b65b559f9f9d1 Mon Sep 17 00:00:00 2001 From: schaepher Date: Tue, 30 Mar 2021 20:43:57 +0800 Subject: [PATCH 323/363] =?UTF-8?q?fix(video=20url):=20=E4=BF=AE=E5=A4=8D?= =?UTF-8?q?=E4=B8=A2=E5=A4=B1=E8=A7=86=E9=A2=91=E5=8F=B7=E5=BE=AE=E5=8D=9A?= =?UTF-8?q?=E8=A7=86=E9=A2=91=E7=9A=84=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- weibo_spider/parser/comment_parser.py | 3 ++- weibo_spider/parser/page_parser.py | 5 +++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/weibo_spider/parser/comment_parser.py b/weibo_spider/parser/comment_parser.py index acc4cb2e..28e75e30 100644 --- a/weibo_spider/parser/comment_parser.py +++ b/weibo_spider/parser/comment_parser.py @@ -47,7 +47,8 @@ def get_video_page_url(self): try: self.selector = handle_html(self.cookie, self.url) if 
self.selector is not None: - links = self.selector.xpath("body/div[@class='c' and @id][1]/div/span/a") + # 来自微博视频号的格式与普通格式不一致,不加 span 层级 + links = self.selector.xpath("body/div[@class='c' and @id][1]/div//a") for a in links: if 'm.weibo.cn/s/video/show?object_id=' in a.xpath( '@href')[0]: diff --git a/weibo_spider/parser/page_parser.py b/weibo_spider/parser/page_parser.py index dd31bbe0..50eca4d7 100644 --- a/weibo_spider/parser/page_parser.py +++ b/weibo_spider/parser/page_parser.py @@ -273,12 +273,13 @@ def get_video_url(self, info): weibo_id = info.xpath('@id')[0][2:] try: video_page_url = '' - a_text = info.xpath('div[1]/span[@class="ctt"]/a/text()') + a_text = info.xpath('./div[1]//a/text()') if u'全文' in a_text: video_page_url = CommentParser(self.cookie, weibo_id).get_video_page_url() else: - a_list = info.xpath('div[1]/span[@class="ctt"]/a') + # 来自微博视频号的格式与普通格式不一致,不加 span 层级 + a_list = info.xpath('./div[1]//a') for a in a_list: if 'm.weibo.cn/s/video/show?object_id=' in a.xpath( '@href')[0]: From d90e35a1e5e820e6c4ec1a68c338aabe44c93882 Mon Sep 17 00:00:00 2001 From: Chen Lei Date: Thu, 1 Apr 2021 14:12:02 +0800 Subject: [PATCH 324/363] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index f247931a..9e35bf3c 100644 --- a/README.md +++ b/README.md @@ -17,9 +17,9 @@ 同时支持下载微博中的图片和视频,具体的可下载文件如下: - **原创**微博中的原始**图片**(可选) -- **转发**微博中的原始**图片**([免cookie版](https://github.com/dataabc/weibo-crawler)特有) +- **转发**微博中的原始**图片**(可选) - **原创**微博中的**视频**(可选) -- **转发**微博中的**视频**([免cookie版](https://github.com/dataabc/weibo-crawler)特有) +- **转发**微博中的**视频**(可选) - **原创**微博**Live Photo**中的**视频**([免cookie版](https://github.com/dataabc/weibo-crawler)特有) - **转发**微博**Live Photo**中的**视频**([免cookie版](https://github.com/dataabc/weibo-crawler)特有) From 4e195f5418589e4307cf23aba00065e9f974b41a Mon Sep 17 00:00:00 2001 From: dataabc Date: Thu, 1 Apr 2021 19:56:48 +0800 Subject: [PATCH 325/363] 
=?UTF-8?q?perf:=20=E4=BC=98=E5=8C=96=E5=9B=BE?= =?UTF-8?q?=E7=89=87=E5=AD=98=E5=82=A8=E7=9B=AE=E5=BD=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Issue #329 --- setup.py | 2 +- weibo_spider/downloader/img_downloader.py | 7 +++++-- weibo_spider/downloader/origin_picture_downloader.py | 3 +-- weibo_spider/downloader/retweet_picture_downloader.py | 3 +-- 4 files changed, 8 insertions(+), 7 deletions(-) diff --git a/setup.py b/setup.py index 1b88a456..43ab90ba 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setuptools.setup( name='weibo-spider', - version='0.2.2', + version='0.2.4', author='Chen Lei', author_email='chillychen1991@gmail.com', description='新浪微博爬虫,用python爬取新浪微博数据。', diff --git a/weibo_spider/downloader/img_downloader.py b/weibo_spider/downloader/img_downloader.py index adea0e09..f87ca2ae 100644 --- a/weibo_spider/downloader/img_downloader.py +++ b/weibo_spider/downloader/img_downloader.py @@ -12,6 +12,9 @@ def __init__(self, file_dir, file_download_timeout): def handle_download(self, urls, w): """处理下载相关操作""" file_prefix = w.publish_time[:10].replace('-', '') + '_' + w.id + file_dir = self.file_dir + os.sep + self.describe + if not os.path.isdir(file_dir): + os.makedirs(file_dir) if ',' in urls: url_list = urls.split(',') for i, url in enumerate(url_list): @@ -21,7 +24,7 @@ def handle_download(self, urls, w): else: file_suffix = url[index:] file_name = file_prefix + '_' + str(i + 1) + file_suffix - file_path = self.file_dir + os.sep + file_name + file_path = file_dir + os.sep + file_name self.download_one_file(url, file_path, w.id) else: index = urls.rfind('.') @@ -30,5 +33,5 @@ def handle_download(self, urls, w): else: file_suffix = urls[index:] file_name = file_prefix + file_suffix - file_path = self.file_dir + os.sep + file_name + file_path = file_dir + os.sep + file_name self.download_one_file(urls, file_path, w.id) diff --git a/weibo_spider/downloader/origin_picture_downloader.py 
b/weibo_spider/downloader/origin_picture_downloader.py index ae225f59..1f4e76c7 100644 --- a/weibo_spider/downloader/origin_picture_downloader.py +++ b/weibo_spider/downloader/origin_picture_downloader.py @@ -1,9 +1,8 @@ -import os - from .img_downloader import ImgDownloader class OriginPictureDownloader(ImgDownloader): def __init__(self, file_dir, file_download_timeout): super().__init__(file_dir, file_download_timeout) + self.describe = u'原创微博图片' self.key = 'original_pictures' diff --git a/weibo_spider/downloader/retweet_picture_downloader.py b/weibo_spider/downloader/retweet_picture_downloader.py index f4f6b463..7ab39c5c 100644 --- a/weibo_spider/downloader/retweet_picture_downloader.py +++ b/weibo_spider/downloader/retweet_picture_downloader.py @@ -1,9 +1,8 @@ -import os - from .img_downloader import ImgDownloader class RetweetPictureDownloader(ImgDownloader): def __init__(self, file_dir, file_download_timeout): super().__init__(file_dir, file_download_timeout) + self.describe = u'转发微博图片' self.key = 'retweet_pictures' From 9db54e1ad52540732cfd5e03bd7513b117374839 Mon Sep 17 00:00:00 2001 From: dataabc Date: Thu, 1 Apr 2021 20:12:16 +0800 Subject: [PATCH 326/363] =?UTF-8?q?perf:=20=E4=BC=98=E5=8C=96=E5=9B=BE?= =?UTF-8?q?=E7=89=87=E4=B8=8B=E8=BD=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- weibo_spider/spider.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/weibo_spider/spider.py b/weibo_spider/spider.py index 365e169d..974ee998 100644 --- a/weibo_spider/spider.py +++ b/weibo_spider/spider.py @@ -274,12 +274,13 @@ def initialize_info(self, user_config): self.downloaders = [] if self.pic_download == 1: - from .downloader import OriginPictureDownloader, RetweetPictureDownloader + from .downloader import (OriginPictureDownloader, + RetweetPictureDownloader) self.downloaders.append( OriginPictureDownloader(self._get_filepath('img'), self.file_download_timeout)) - + if self.pic_download and 
not self.filter: self.downloaders.append( RetweetPictureDownloader(self._get_filepath('img'), self.file_download_timeout)) From 2853c4261cd478c7e027f5ab219eb47478367453 Mon Sep 17 00:00:00 2001 From: dataabc Date: Thu, 15 Apr 2021 20:47:04 +0800 Subject: [PATCH 327/363] =?UTF-8?q?perf:=20=E4=BC=98=E5=8C=96=E7=88=AC?= =?UTF-8?q?=E5=8F=96=E5=81=9C=E6=AD=A2=E6=B5=81=E7=A8=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- setup.py | 2 +- weibo_spider/parser/page_parser.py | 12 +++++++++--- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/setup.py b/setup.py index 43ab90ba..ba026a42 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setuptools.setup( name='weibo-spider', - version='0.2.4', + version='0.2.5', author='Chen Lei', author_email='chillychen1991@gmail.com', description='新浪微博爬虫,用python爬取新浪微博数据。', diff --git a/weibo_spider/parser/page_parser.py b/weibo_spider/parser/page_parser.py index 50eca4d7..e05335ef 100644 --- a/weibo_spider/parser/page_parser.py +++ b/weibo_spider/parser/page_parser.py @@ -1,11 +1,8 @@ -import json import logging import re import sys from datetime import datetime, timedelta -import requests - from .. 
import datetime_util from ..weibo import Weibo from .comment_parser import CommentParser @@ -17,8 +14,13 @@ class PageParser(Parser): + empty_count = 0 + def __init__(self, cookie, user_config, page, filter): self.cookie = cookie + if hasattr(PageParser, + 'user_uri') and self.user_uri != user_config['user_uri']: + PageParser.empty_count = 0 self.user_uri = user_config['user_uri'] self.since_date = user_config['since_date'] self.end_date = user_config['end_date'] @@ -43,9 +45,13 @@ def __init__(self, cookie, user_config, page, filter): info = self.selector.xpath("//div[@class='c']") is_exist = info[0].xpath("div/span[@class='ctt']") if is_exist: + PageParser.empty_count = 0 break if not is_exist: + PageParser.empty_count += 1 + if PageParser.empty_count > 2: self.to_continue = False + PageParser.empty_count = 0 self.filter = filter def get_one_page(self, weibo_id_list): From 942ce80bde9e95ddf9a79568e0e574bcf138ba6c Mon Sep 17 00:00:00 2001 From: Zhengyang Song Date: Sat, 5 Jun 2021 23:28:15 +0800 Subject: [PATCH 328/363] Fix markdown style. Mainly based on [1]. 1.
are replaced by blank lines. 2. Add lexers [2] after code fence. 3. Add one space after ordered list numbers. [1] https://github.com/DavidAnson/markdownlint/blob/v0.23.1/doc/Rules.md [2] https://pygments.org/docs/lexers/ --- .github/ISSUE_TEMPLATE/bug-report.md | 20 +- .github/ISSUE_TEMPLATE/failed.md | 20 +- .github/ISSUE_TEMPLATE/feature-request.md | 7 +- CONTRIBUTING.md | 18 +- README.md | 25 ++- docs/FAQ.md | 33 ++-- docs/academic.md | 4 +- docs/automation.md | 46 +++-- docs/contributors.md | 8 +- docs/cookie.md | 19 +- docs/example.md | 77 +++++--- docs/settings.md | 216 ++++++++++++++-------- docs/userid.md | 21 ++- 13 files changed, 341 insertions(+), 173 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/bug-report.md b/.github/ISSUE_TEMPLATE/bug-report.md index 38d1e3d6..05351bde 100644 --- a/.github/ISSUE_TEMPLATE/bug-report.md +++ b/.github/ISSUE_TEMPLATE/bug-report.md @@ -7,24 +7,28 @@ assignees: '' --- -感谢您申报bug,为了表示感谢,如果bug确实存在,您将出现在本项目的贡献者列表里;如果您不但发现了bug,还提供了很好的解决方案,我们会邀请您以pull request的方式成为本项目的代码贡献者(Contributor);如果您多次提供很好的pull request,我们将邀请您成为本项目的协助者(Collaborator)。当然,是否提供解决方按都是自愿的。不管是否是真正的bug、是否提供解决方案,我们都感谢您对本项目的帮助。
-
+感谢您申报bug,为了表示感谢,如果bug确实存在,您将出现在本项目的贡献者列表里;如果您不但发现了bug,还提供了很好的解决方案,我们会邀请您以pull request的方式成为本项目的代码贡献者(Contributor);如果您多次提供很好的pull request,我们将邀请您成为本项目的协助者(Collaborator)。当然,是否提供解决方案都是自愿的。不管是否是真正的bug、是否提供解决方案,我们都感谢您对本项目的帮助。 +- 问:请您指明哪个版本出了bug(github版/PyPi版/全部)? -- 问:请您指明哪个版本出了bug(github版/PyPi版/全部)?
答: -- 问:您使用的是否是最新的程序(是/否)?
+- 问:您使用的是否是最新的程序(是/否)? + 答: -- 问:爬取任意用户都会复现此bug吗(是/否)?
+- 问:爬取任意用户都会复现此bug吗(是/否)? + 答: -- 问:若只有爬特定微博时才出bug,能否提供出错微博的weibo_id或url(非必填)?
+- 问:若只有爬特定微博时才出bug,能否提供出错微博的weibo_id或url(非必填)? + 答: -- 问:若您已提供出错微博的weibo_id或url,可忽略此内容,否则能否提供出错账号的**user_id**及您配置的**since_date**,方便我们定位出错微博(非必填)?
+- 问:若您已提供出错微博的weibo_id或url,可忽略此内容,否则能否提供出错账号的**user_id**及您配置的**since_date**,方便我们定位出错微博(非必填)? + 答: -- 问:如果方便,请您描述bug详情,如果代码报错,最好附上错误提示。
+- 问:如果方便,请您描述bug详情,如果代码报错,最好附上错误提示。 + 答: diff --git a/.github/ISSUE_TEMPLATE/failed.md b/.github/ISSUE_TEMPLATE/failed.md index 142407d5..dfd47ac6 100644 --- a/.github/ISSUE_TEMPLATE/failed.md +++ b/.github/ISSUE_TEMPLATE/failed.md @@ -7,22 +7,28 @@ assignees: '' --- -为了更好的解决问题,请认真回答下面的问题。等到问题解决,请及时关闭本issue。
+为了更好的解决问题,请认真回答下面的问题。等到问题解决,请及时关闭本issue。 + +- 问:请您指明哪个版本运行出错(github版/PyPi版/全部)? -- 问:请您指明哪个版本运行出错(github版/PyPi版/全部)?
答: -- 问:您使用的是否是最新的程序(是/否)?
+- 问:您使用的是否是最新的程序(是/否)? + 答: -- 问:爬取任意用户都会运行出错吗(是/否)?
+- 问:爬取任意用户都会运行出错吗(是/否)? + 答: -- 问:若只有爬特定微博时才出错,能否提供出错微博的weibo_id或url(非必填)?
+- 问:若只有爬特定微博时才出错,能否提供出错微博的weibo_id或url(非必填)? + 答: -- 问:若您已提供出错微博的weibo_id或url,可忽略此内容,否则能否提供出错账号的**user_id**及您配置的**since_date**,方便我们定位出错微博(非必填)?
+- 问:若您已提供出错微博的weibo_id或url,可忽略此内容,否则能否提供出错账号的**user_id**及您配置的**since_date**,方便我们定位出错微博(非必填)? + 答: -- 问:如果方便,请您描述出错详情,最好附上错误提示。
+- 问:如果方便,请您描述出错详情,最好附上错误提示。 + 答: diff --git a/.github/ISSUE_TEMPLATE/feature-request.md b/.github/ISSUE_TEMPLATE/feature-request.md index 65870218..60f26098 100644 --- a/.github/ISSUE_TEMPLATE/feature-request.md +++ b/.github/ISSUE_TEMPLATE/feature-request.md @@ -7,9 +7,10 @@ assignees: '' --- -- 问:请说明需要什么新功能。
-答: +- 问:请说明需要什么新功能。 -- 问:请说明添加该功能的意义。(非必填)
答: +- 问:请说明添加该功能的意义。(非必填) + +答: diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index a2c6f347..c2077636 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,22 +1,38 @@ # 为本项目做贡献 + 本项目使用**Python3**编写,感谢大家对项目的支持,也欢迎大家为开源项目做贡献。鉴于大家拥有不同的技能、经验、认知、时间等,每个人可以根据自身的情况为本项目贡献力量。我们不会因为贡献者写的代码少或者提的建议不好而失去感恩之心,每一个乐于奉献的人都值得并且应该被尊重。所以,如果您觉得自己的代码或建议不好,而不好意思去贡献,这样可能就让本项目失去了一次变得更好的机会。所以,如果您有好的想法、建议,或者发现了bug,欢迎通过issue提出来,这也是一种贡献方式。如果您想要为本项目贡献代码,我们也非常欢迎。最开始您可以通过pull request方式提交代码,如果我们发现您的代码质量非常高,或者非常有想法等,我们会邀请您请成为本项目的协作者([Collaborator](https://help.github.com/cn/github/setting-up-and-managing-your-github-user-account/permission-levels-for-a-user-account-repository#collaborator-access-on-a-repository-owned-by-a-user-account)),这样您就可以直接向本项目提交代码了。在您贡献代码之前,请先阅读下面的说明,这会让您更好的贡献代码。 ## 贡献代码之前 + 如果要开发新功能或者其它需要大量编写代码的修改,在开发之前最好发Issue说明一下。比如,“我准备开发xx新功能”或者“我想修改xx功能”之类的。因为要开发的功能不一定适合本项目,所以提前说明讨论,判断新功能或修改是否有必要。否则,费时费力写了很多代码,结果最后没有被采纳,可能会做一些无用功。 + ## Python风格规范(建议Python新手阅读) + 参考[Python风格规范](https://zh-google-styleguide.readthedocs.io/en/latest/google-python-styleguide/python_style_rules/) -或者[Python风格规范](https://github.com/zh-google-styleguide/zh-google-styleguide/blob/master/google-python-styleguide/python_style_rules.rst) +或者[Python风格规范](https://github.com/zh-google-styleguide/zh-google-styleguide/blob/master/google-python-styleguide/python_style_rules.rst), 二者内容是一样的。 + ## git提交规范 + 参考[Conventional Commits](https://www.conventionalcommits.org/en/v1.0.0/) 或者[Git提交规范](https://zhuanlan.zhihu.com/p/67804026),commit描述中文英文皆可,只要符合规范就好。 + ## git提交建议(可选) + 本建议是可选的,如果你觉得不合理,可以按自己的方式编写代码。建议每次提交都是代码改动较少的提交,如果新功能需要大量修改代码,建议将新功能分成几个小模块,每个模块提交一次。原因是这样更容易管理代码。比如,一个新功能包含几个模块。其中大部分模块都写的很好,但是有一个模块有bug。分模块提交只需要单独处理出问题的模块,其他模块不受影响。 + ## Python之linter + 本项目使用flake8。 + ## Python之formatter + 本项目使用yapf。 + ## 引号的使用 + 代码中**建议使用单引号**,只有在特殊情况下使用双引号,如类、方法、函数等开头的注释使用6个双引号包裹(注释左边三个双引号,右边三个双引号),或者字符串中中已经包含单引号了,则要用双引号包裹。 + ## 避免过多的模块依赖 + 除非有必要,尽量少使用非内置的模块,因为会增加用户的安装成本,当然如果该模块能够为本项目或用户带来很多便利,则可以使用。 diff --git a/README.md 
b/README.md index 9e35bf3c..2cbbb543 100644 --- a/README.md +++ b/README.md @@ -16,6 +16,7 @@ - **SQLite数据库**(可选) 同时支持下载微博中的图片和视频,具体的可下载文件如下: + - **原创**微博中的原始**图片**(可选) - **转发**微博中的原始**图片**(可选) - **原创**微博中的**视频**(可选) @@ -25,6 +26,8 @@ ## 内容列表 +[TOC] + - [Weibo Spider](#weibo-spider) - [内容列表](#内容列表) - [获取到的字段](#获取到的字段) @@ -52,7 +55,7 @@ 本部分为爬取到的字段信息说明,为了与[免cookie版](https://github.com/dataabc/weibo-crawler)区分,下面将两者爬取到的信息都列出来。如果是免cookie版所特有的信息,会有免cookie标注,没有标注的为二者共有的信息。 -**用户信息** +### 用户信息 - 用户id:微博用户id,如"1669879400",其实这个字段本来就是已知字段 - 昵称:用户昵称,如"Dear-迪丽热巴" @@ -76,9 +79,7 @@ - 认证类型(免cookie版):用户认证类型,如个人认证、企业认证、政府认证等 - 认证信息:为认证用户特有,用户信息栏显示的认证信息 -*** - -**微博信息** +### 微博信息 - 微博id:微博唯一标志 - 微博内容:微博正文 @@ -153,13 +154,17 @@ $ python3 -m weibo_spider --config_path="config.json" ``` 如果你想指定文件(csv、txt、json、图片、视频)保存路径,可以通过output_dir参数设定。假如你想把文件保存到/home/weibo/目录,可以运行如下命令: -``` + +```bash $ python3 -m weibo_spider --output_dir="/home/weibo/" ``` + 如果你想通过命令行输入user_id,可以使用参数u,可以输入一个或多个user_id,每个user_id以英文逗号分开,如果这些user_id中有重复的user_id,程序会自动去重。命令行如下: -``` + +```bash $ python3 -m weibo_spider --u="1669879400,1223178222" ``` + 程序会获取user_id分别为1669879400和1223178222的微博用户的微博,后面会讲[如何获取user_id](#如何获取user_id)。该方式的所有user_id使用config.json中的since_date和end_date设置,通过修改它们的值可以控制爬取的时间范围。若config.json中的user_id_list是文件路径,每个命令行中的user_id都会自动保存到该文件内,且自动更新since_date;若不是路径,user_id会保存在当前目录的user_id_list.txt内,且自动更新since_date,若当前目录下不存在user_id_list.txt,程序会自动创建它。 ## 个性化定制程序(可选) @@ -192,6 +197,7 @@ $ python3 -m weibo_spider --u="1669879400,1223178222" - wb.user['weibo_num']:微博数; - wb.user['following']:关注数; - wb.user['followers']:粉丝数; +
**wb.weibo**:除不包含上述信息外,wb.weibo包含爬取到的所有微博信息,如**微博id**、**微博正文**、**原始图片url**、**发布位置**、**发布时间**、**发布工具**、**点赞数**、**转发数**、**评论数**等。如果爬的是全部微博(原创+转发),除上述信息之外,还包含被**转发微博原始图片url**、**是否为原创微博**等。wb.weibo是一个列表,包含了爬取的所有微博信息。wb.weibo[0]为爬取的第一条微博,wb.weibo[1]为爬取的第二条微博,以此类推。当filter=1时,wb.weibo[0]为爬取的第一条**原创**微博,以此类推。wb.weibo[0]['id']为第一条微博的id,wb.weibo[0]['content']为第一条微博的正文,wb.weibo[0]['publish_time']为第一条微博的发布时间,还有其它很多信息不在赘述,大家可以点击下面的"详情"查看具体用法。 @@ -231,9 +237,11 @@ $ python3 -m weibo_spider --u="1669879400,1223178222" ## 常见问题 如果运行程序的过程中出现错误,可以查看[常见问题](https://github.com/dataabc/weiboSpider/blob/master/docs/FAQ.md)页面,里面包含了最常见的问题及解决方法。如果出现的错误不在常见问题里,您可以通过[发issue](https://github.com/dataabc/weiboSpider/issues/new/choose)寻求帮助,我们会很乐意为您解答。 + ## 学术研究 本项目通过获取微博数据,为写论文、做研究等非商业项目提供所需数据。[学术研究文档](https://github.com/dataabc/weiboSpider/blob/master/docs/academic.md)是一些在论文或研究等方面使用过本程序的项目,这些项目展示已征得所有者同意。在一些涉及隐私的描述上,已与所有者做了沟通,描述中只介绍所有者允许展示的部分。如果部分信息所有者之前同意展示并且已经写在了文档中,现在又不想展示了,可以通过邮件(chillychen1991@gmail.com)或issue的方式告诉我,我会删除相关信息。同时,也欢迎使用本项目写论文或做其它学术研究的朋友,将自己的研究成果展示在[学术研究文档](https://github.com/dataabc/weiboSpider/blob/master/docs/academic.md)里,这完全是自愿的。 + ## 相关项目 - [weibo-crawler](https://github.com/dataabc/weibo-crawler) - 功能和本项目完全一样,可以不添加cookie,获取的微博属性更多; @@ -249,6 +257,5 @@ $ python3 -m weibo_spider --u="1669879400,1223178222" ## 注意事项 -1.user_id不能为爬虫微博的user_id。因为要爬微博信息,必须先登录到某个微博账号,此账号我们姑且称为爬虫微博。爬虫微博访问自己的页面和访问其他用户的页面,得到的网页格式不同,所以无法爬取自己的微博信息;如果想要爬取爬虫微博内容,可以参考[获取自身微博信息](https://github.com/dataabc/weiboSpider/issues/113); - -2.cookie有期限限制,大约三个月。若提示cookie错误或已过期,需要重新更新cookie。 +1. user_id不能为爬虫微博的user_id。因为要爬微博信息,必须先登录到某个微博账号,此账号我们姑且称为爬虫微博。爬虫微博访问自己的页面和访问其他用户的页面,得到的网页格式不同,所以无法爬取自己的微博信息;如果想要爬取爬虫微博内容,可以参考[获取自身微博信息](https://github.com/dataabc/weiboSpider/issues/113); +2. cookie有期限限制,大约三个月。若提示cookie错误或已过期,需要重新更新cookie。 diff --git a/docs/FAQ.md b/docs/FAQ.md index 107872dd..c25a1938 100644 --- a/docs/FAQ.md +++ b/docs/FAQ.md @@ -1,34 +1,45 @@ -## 常见问题 +# 常见问题 + +## 1. 
程序运行出错,错误提示中包含“ImportError: cannot import name 'config_util' from '__main__'”,如何解决? -### 1.程序运行出错,错误提示中包含“ImportError: cannot import name 'config_util' from '__main__'”,如何解决? 出现这种错误,说明使用者很可能是直接运行的.py文件,程序正确的运行方式是在weiboSpider目录下,运行如下命令: -``` + +```bash python3 -m weibo_spider ``` -### 2.程序运行出错,错误提示中包含“'NoneType' object”字样,如何解决? +## 2. 程序运行出错,错误提示中包含“'NoneType' object”字样,如何解决? + 这是最常见的问题之一。出错原因是爬取速度太快,被暂时限制了,限制可能包含爬虫账号限制和ip限制。一般情况下,一段时间后限制会自动解除。可通过降低爬取速度避免被限制,具体修改config.json文件中的如下代码: -``` + +```json "random_wait_pages": [1, 5], "random_wait_seconds": [6, 10], "global_wait": [[1000, 3600], [500, 2000]], ``` + 前两行的意思是每爬取1到5页,随机等待6到10秒。可以通过加快暂停频率(减小random_wait_pages内的值)或增加等待时间(加大random_wait_seconds内的值)避免被限制。最后一行的意思是获取1000页微博,一次性等待3600秒;之后获取500页微博一次性等待2000秒。默认只有两个global_wait配置([1000, 3600]和[500, 2000]),可以添加更多个,也可以自定义。当配置使用完,如默认配置在获取1500(1000+500)页微博后就用完了,之后程序会从第一个配置开始循环使用(获取第1501页到2500页等待3600秒,获取第2501页到第3000页等待2000秒,以此类推)。 -### 3.如何获取微博评论? +## 3. 如何获取微博评论? + 因为限制,只能获取一部分评论,无法获取全部,因此暂时没有添加获取评论功能的计划。 -### 4.有的长微博正文只能获取一部分内容,如何解决? +## 4. 有的长微博正文只能获取一部分内容,如何解决? + 程序是可以获取长微博全文的。程序首先在微博列表页获取微博,如果发现长微博(正文没有显示完整,以“全文”代替部分内容的微博),会先保存这个不全的内容,然后去该长微博的详情页尝试获取全文,如果获取成功,获取的内容就是微博文本;如果获取失败,等待若干秒重新获取;如果连续尝试5次都失败,就用上面不全的内容代替。这样做的原因是避免因部分长微博获取失败而卡住。如果想尝试更多次,可以修改comment_parser.py文件get_long_weibo方法内for循环的次数。 -### 5.如何按指定关键词获取微博? +## 5. 如何按指定关键词获取微博? + 请使用[weibo-search](https://github.com/dataabc/weibo-search)。该程序可以连续获取一个或多个微博关键词搜索结果,并将结果写入文件(可选)、数据库(可选)等。所谓微博关键词搜索即:搜索正文中包含指定关键词的微博,可以指定搜索的时间范围。对于非常热门的关键词,一天的时间范围,可以获得1000万以上的搜索结果,N天的时间范围就可以获得1000万 X N搜索结果。对于大多数关键词,一天产生的相应微博数量应该在1000万条以下,因此可以说该程序可以获得大部分关键词的全部或近似全部的搜索结果。而且该程序可以获得搜索结果的所有信息,本程序获得的微博信息该程序都能获得。 -### 6.如何获取微博用户关注列表中用户的user_id? +## 6. 如何获取微博用户关注列表中用户的user_id? 
+ 请使用[weibo-follow](https://github.com/dataabc/weibo-follow)。该程序可以利用一个user_id,获取该user_id微博用户关注人的user_id,一个user_id最多可以获得200个user_id,并写入user_id_list.txt文件。程序支持读文件,利用这200个user_id,可以获得最多200X200=40000个user_id。再利用这40000个user_id可以得到40000X200=8000000个user_id,如此反复,以此类推,可以获得大量user_id。本项目也支持读文件,将上述程序的结果文件user_id_list.txt路径赋值给本项目config.json的user_id_list参数,就可以获得这些user_id用户所发布的大量微博。 -### 7.如何获取自己的微博? +## 7. 如何获取自己的微博? + 修改page_parser.py中__init__方法,将self.url修改为: -``` + +```python self.url = "https://weibo.cn/%s/profile?page=%d" % (user_uri, page) ``` diff --git a/docs/academic.md b/docs/academic.md index ec2ced91..1f378eaa 100644 --- a/docs/academic.md +++ b/docs/academic.md @@ -1,8 +1,8 @@ -## 学术研究 +# 学术研究 本项目通过获取微博数据,为写论文、做研究等非商业项目提供所需数据。下面是一些在论文或研究等方面使用过本程序的项目。在一些涉及隐私的描述上,已与研究者做了沟通,在下面的描述中只介绍研究者 允许展示的部分。如果部分信息研究者之前同意展示并且已经写在了本文档中,现在又不想展示了,可以通过邮件(chillychen1991@gmail.com)或issue的方式告诉我,我会删除相关信息。同时,使用本项目写论文或做其它学术研究的朋友,如果想把自己的研究成果展示在下面,也可以通过邮件或issue的方式告诉我。 *** -英国伦敦国王学院[Mak-LokGay](https://github.com/Mak-LokGay)的[毕业论文](https://github.com/Mak-LokGay/KCL_Dissertation) +- 英国伦敦国王学院[Mak-LokGay](https://github.com/Mak-LokGay)的[毕业论文](https://github.com/Mak-LokGay/KCL_Dissertation) diff --git a/docs/automation.md b/docs/automation.md index 933a9ff9..2d8970dc 100644 --- a/docs/automation.md +++ b/docs/automation.md @@ -1,40 +1,58 @@ -## 定期自动爬取微博(可选) -我们爬取了微博以后,很多微博账号又可能发了一些新微博,定期自动爬取微博就是每隔一段时间自动运行程序,自动爬取这段时间产生的新微博(忽略以前爬过的旧微博)。本部分为可选部分,如果不需要可以忽略。
+# 定期自动爬取微博(可选) + +我们爬取了微博以后,很多微博账号又可能发了一些新微博,定期自动爬取微博就是每隔一段时间自动运行程序,自动爬取这段时间产生的新微博(忽略以前爬过的旧微博)。本部分为可选部分,如果不需要可以忽略。 + 思路是**利用第三方软件,如crontab,让程序每隔一段时间运行一次**。因为是要跳过以前爬过的旧微博,只爬新微博。所以需要**设置一个动态的since_date**。很多时候我们使用的since_date是固定的,比如since_date="2018-01-01",程序就会按照这个设置从最新的微博一直爬到发布时间为2018-01-01的微博(包括这个时间)。因为我们想追加新微博,跳过旧微博。第二次爬取时since_date值就应该是当前时间到上次爬取的时间。 如果我们使用最原始的方式实现追加爬取,应该是这样: -``` + +```text 假如程序第一次执行时间是2019-06-06,since_date假如为2018-01-01,那这一次就是爬取从2018-01-01到2019-06-06这段时间用户所发的微博; 第二次爬取,我们想要接着上次的爬,那since_date的值应该是上次程序执行的日期,即2019-06-06 ``` -上面的方法太麻烦,因为每次都要手动设置since_date。因此我们需要动态设置since_date,即程序根据实际情况,自动生成since_date。
-有两种方法实现动态更新since_date,**推荐使用方法二**。
-**方法一:将since_date设置成整数**
+ +上面的方法太麻烦,因为每次都要手动设置since_date。因此我们需要动态设置since_date,即程序根据实际情况,自动生成since_date。 + +有两种方法实现动态更新since_date,**推荐使用方法二**。 + +## 方法一:将since_date设置成整数 + 将config.json文件中的since_date设置成整数,如: -``` + +```json "since_date": 10, ``` -这个配置告诉程序爬取最近10天的微博,更准确说是爬取发布时间从**10天前到本程序开始执行时**之间的微博。这样since_date就是一个动态的变量,每次程序执行时,它的值就是当前日期减10。配合crontab每9天或10天执行一次,就实现了定期追加爬取。
-**方法二:将上次执行程序的时间写入文件(推荐)**
-这个方法很简单,就是使用[程序设置](https://github.com/dataabc/weiboSpider/blob/master/docs/settings.md)中**设置user_id_list**的第二种方法设置user_id_list,这样设置就全部结束了。
+ +这个配置告诉程序爬取最近10天的微博,更准确说是爬取发布时间从**10天前到本程序开始执行时**之间的微博。这样since_date就是一个动态的变量,每次程序执行时,它的值就是当前日期减10。配合crontab每9天或10天执行一次,就实现了定期追加爬取。 + +## 方法二:将上次执行程序的时间写入文件(推荐) + +这个方法很简单,就是使用[程序设置](https://github.com/dataabc/weiboSpider/blob/master/docs/settings.md)中**设置user_id_list**的第二种方法设置user_id_list,这样设置就全部结束了。 + 说下这个方法的好处和原理,假如你的txt文件内容为: -``` + +```text 1669879400 1223178222 胡歌 1729370543 郭碧婷 2019-01-01 19:28 ``` + 第一次执行时,因为第一行和第二行都没有写时间,程序会按照config.json文件中since_date的值爬取,第三行有时间“2019-01-01 19:28”,程序就会把这个时间当作since_date。每个用户爬取结束程序都会自动更新txt文件,每一行第一部分是user_id,第二部分是用户昵称,第三部分是程序**准备**爬取该用户第一条微博(最新微博)时的时间。爬完三个用户后,txt文件的内容自动更新为: -``` + +```text 1669879400 Dear-迪丽热巴 2020-01-13 19:18 1223178222 胡歌 2020-01-13 19:28 1729370543 郭碧婷 2020-01-13 19:33 ``` + 下次再爬取微博的时候,程序会把每行的时间数据作为since_date。这样的好处一是不用修改since_date,程序自动更新;二是每一个用户都可以单独拥有只属于自己的since_date,每个用户的since_date相互独立,互不干扰。since_date既可以是“yyyy-mm-dd”格式,也可以是“yyyy-mm-dd hh:mm”格式。比如,现在又添加了一个新用户,例如杨紫,你想获取她从2018-01-23到现在的全部微博,只需要这样修改txt文件: -``` + +```text 1669879400 Dear-迪丽热巴 2020-01-13 19:18 1223178222 胡歌 2020-01-13 19:28 1729370543 郭碧婷 2020-01-13 19:33 1227368500 杨紫 2018-01-23 ``` + 注意每一行的用户配置参数以空格分隔,如果第一个参数全部由数字组成,程序就认为此行为一个用户的配置,否则程序会认为该行只是注释,跳过该行;第二个参数可以为任意格式,建议写用户昵称;第三个如果是日期格式(yyyy-mm-dd),程序就将该日期设置为用户自己的since_date,否则使用config.json中的since_date爬取该用户的微博,第二个参数和第三个参数也可以不填。 -推荐第二种方法,本方法是[Evifly](https://github.com/Evifly)想出的,非常热心非常有想法的网友,在此感谢。
+推荐第二种方法,本方法是[Evifly](https://github.com/Evifly)想出的,非常热心非常有想法的网友,在此感谢。 diff --git a/docs/contributors.md b/docs/contributors.md index 12651fd6..a58eb5c9 100644 --- a/docs/contributors.md +++ b/docs/contributors.md @@ -6,19 +6,19 @@ ## 主要代码开发者 -|[dataabc](https://github.com/dataabc) |[songzy12](https://github.com/songzy12) | -| - | - | +| [dataabc](https://github.com/dataabc) | [songzy12](https://github.com/songzy12) | +| - | - | ## 代码贡献者 -|[codermino](https://github.com/codermino) |[duangan1](https://github.com/duangan1) | [MKSP2015](https://github.com/MKSP2015) | +| [codermino](https://github.com/codermino) | [duangan1](https://github.com/duangan1) | [MKSP2015](https://github.com/MKSP2015) | | - | - | - | ## 优质issue提出者 | | | | | | | | - | - | - | - | - | - | -| [13531982270](https://github.com/13531982270) | [Archenemy61](https://github.com/Archenemy61) | [arctanx](https://github.com/arctanx) |[bossming](https://github.com/bossming)|[bubblesran](https://github.com/bubblesran)| [cangling](https://github.com/cangling)| +| [13531982270](https://github.com/13531982270) | [Archenemy61](https://github.com/Archenemy61) | [arctanx](https://github.com/arctanx) | [bossming](https://github.com/bossming) | [bubblesran](https://github.com/bubblesran) | [cangling](https://github.com/cangling)| | [Ccccche](https://github.com/Ccccche) | [Evifly](https://github.com/Evifly) | [gudaost](https://github.com/gudaost) | [Hylan129](https://github.com/Hylan129) | [HZzzzy](https://github.com/HZzzzy) | [kur0mi](https://github.com/kur0mi) | | [leonall](https://github.com/leonall) | [liu-song](https://github.com/liu-song) | [Issac110](https://github.com/Issac110) | [MengyingQian](https://github.com/MengyingQian) | [PandGnone](https://github.com/PandGnone) | [PLQin](https://github.com/PLQin) | | [redMUSCLE](https://github.com/redMUSCLE) | [shengdade](https://github.com/shengdade) | [softrime](https://github.com/softrime) | [SugimitoYuuji](https://github.com/SugimitoYuuji) | 
[sunbat](https://github.com/sunbat) | [taichifox95](https://github.com/taichifox95) | diff --git a/docs/cookie.md b/docs/cookie.md index 3a202314..57db59cf 100644 --- a/docs/cookie.md +++ b/docs/cookie.md @@ -1,9 +1,10 @@ -## 如何获取cookie -1.用Chrome打开
-2.输入微博的用户名、密码,登录,如图所示: -![](https://picture.cognize.me/cognize/github/weibospider/cookie1.png) -登录成功后会跳转到;
-3.按F12键打开Chrome开发者工具,在地址栏输入并跳转到,跳转后会显示如下类似界面: -![](https://picture.cognize.me/cognize/github/weibospider/cookie2.png) -4.依此点击Chrome开发者工具中的Network->Name中的weibo.cn->Headers->Request Headers,"Cookie:"后的值即为我们要找的cookie值,复制即可,如图所示: -![](https://picture.cognize.me/cognize/github/weibospider/cookie3.png) \ No newline at end of file +# 如何获取cookie + +1. 用Chrome打开; +2. 输入微博的用户名、密码,登录,如图所示: +![weibo log in page](https://picture.cognize.me/cognize/github/weibospider/cookie1.png) +登录成功后会跳转到; +3. 按F12键打开Chrome开发者工具,在地址栏输入并跳转到,跳转后会显示如下类似界面: +![chrome debugger network tab](https://picture.cognize.me/cognize/github/weibospider/cookie2.png) +4. 依此点击Chrome开发者工具中的Network->Name中的weibo.cn->Headers->Request Headers,"Cookie:"后的值即为我们要找的cookie值,复制即可,如图所示: +![cookie in request headers section](https://picture.cognize.me/cognize/github/weibospider/cookie3.png) \ No newline at end of file diff --git a/docs/example.md b/docs/example.md index 1d4519a5..d12e08bb 100644 --- a/docs/example.md +++ b/docs/example.md @@ -1,6 +1,8 @@ -## 实例 +# 实例 + 以爬取迪丽热巴的微博为例,我们需要修改**config.json**文件,文件内容如下: -``` + +```json { "user_id_list": ["1669879400"], "filter": 1, @@ -15,23 +17,44 @@ ``` 对于上述参数的含义以及取值范围,这里仅作简单介绍,详细信息见[程序设置](https://github.com/dataabc/weiboSpider/blob/master/docs/settings.md)。 ->**user_id_list**代表我们要爬取的微博用户的user_id,可以是一个或多个,也可以是文件路径,微博用户Dear-迪丽热巴的user_id为1669879400,具体如何获取user_id见[如何获取user_id](https://github.com/dataabc/weiboSpider/blob/master/docs/userid.md);
**filter**的值为1代表爬取全部原创微博,值为0代表爬取全部微博(原创+转发);
**since_date**代表我们要爬取since_date日期之后发布的微博,因为我要爬迪丽热巴的全部原创微博,所以since_date设置了一个非常早的值;
**end_date**代表我们要爬取end_date日期之前发布的微博,since_date配合end_date,表示我们要爬取发布日期在since_date和end_date之间的微博,包含边界,如果end_date值为"now",表示爬取发布日期从since_date到现在的微博;
**write_mode**代表结果文件的保存类型,我想要把结果写入txt文件、csv文件和json文件,所以它的值为["csv", "txt", "json"],如果你想写入数据库,具体设置见[设置数据库](https://github.com/dataabc/weiboSpider/blob/master/docs/settings.md#设置数据库可选);
**pic_download**值为1代表下载微博中的图片,值为0代表不下载;
**video_download**值为1代表下载微博中的视频,值为0代表不下载;
**result_dir_name**控制结果文件夹名,值为1代表文件夹名是用户id,值为0代表文件夹名是用户昵称;
**cookie**是爬虫微博的cookie,具体如何获取cookie见[cookie文档](https://github.com/dataabc/weiboSpider/blob/master/docs/cookie.md),获取cookie后把"your cookie"替换成真实的cookie值即可。
+ +- **user_id_list**代表我们要爬取的微博用户的user_id,可以是一个或多个,也可以是文件路径,微博用户Dear-迪丽热巴的user_id为1669879400,具体如何获取user_id见[如何获取user_id](https://github.com/dataabc/weiboSpider/blob/master/docs/userid.md); +- **filter**的值为1代表爬取全部原创微博,值为0代表爬取全部微博(原创+转发); +- **since_date**代表我们要爬取since_date日期之后发布的微博,因为我要爬迪丽热巴的全部原创微博,所以since_date设置了一个非常早的值; +- **end_date**代表我们要爬取end_date日期之前发布的微博,since_date配合end_date,表示我们要爬取发布日期在since_date和end_date之间的微博,包含边界,如果end_date值为"now",表示爬取发布日期从since_date到现在的微博; +- **write_mode**代表结果文件的保存类型,我想要把结果写入txt文件、csv文件和json文件,所以它的值为["csv", "txt", "json"],如果你想写入数据库,具体设置见[设置数据库](https://github.com/dataabc/weiboSpider/blob/master/docs/settings.md#设置数据库可选); +- **pic_download**值为1代表下载微博中的图片,值为0代表不下载; +- **video_download**值为1代表下载微博中的视频,值为0代表不下载; +- **result_dir_name**控制结果文件夹名,值为1代表文件夹名是用户id,值为0代表文件夹名是用户昵称; +- **cookie**是爬虫微博的cookie,具体如何获取cookie见[cookie文档](https://github.com/dataabc/weiboSpider/blob/master/docs/cookie.md),获取cookie后把"your cookie"替换成真实的cookie值即可。 cookie修改完成后在weiboSpider目录下运行如下命令: + ```bash $ python3 -m weibo_spider ``` -程序会自动生成一个weibo文件夹,我们以后爬取的所有微博都被存储在这里。然后程序在该文件夹下生成一个名为"Dear-迪丽热巴"的文件夹,迪丽热巴的所有微博爬取结果都在这里。"Dear-迪丽热巴"文件夹里包含一个csv文件、一个txt文件、一个json文件、一个img文件夹和一个video文件夹,img文件夹用来存储下载到的图片,video文件夹用来存储下载到的视频。如果你设置了保存数据库功能,这些信息也会保存在数据库里,数据库设置见[设置数据库](https://github.com/dataabc/weiboSpider/blob/master/docs/settings.md#设置数据库可选)部分。
-
-**csv结果文件如下所示:** -![](https://picture.cognize.me/cognize/github/weibospider/weibo_csv.png)*1669879400.csv*
-
-**txt结果文件如下所示:** -![](https://picture.cognize.me/cognize/github/weibospider/weibo_txt.png)*1669879400.txt*
-
-json文件包含迪丽热巴的用户信息和上千条微博信息,内容较多。为了表达清晰,这里仅展示两条微博。
-**json结果文件如下所示:** -``` + +程序会自动生成一个weibo文件夹,我们以后爬取的所有微博都被存储在这里。然后程序在该文件夹下生成一个名为"Dear-迪丽热巴"的文件夹,迪丽热巴的所有微博爬取结果都在这里。"Dear-迪丽热巴"文件夹里包含一个csv文件、一个txt文件、一个json文件、一个img文件夹和一个video文件夹,img文件夹用来存储下载到的图片,video文件夹用来存储下载到的视频。如果你设置了保存数据库功能,这些信息也会保存在数据库里,数据库设置见[设置数据库](https://github.com/dataabc/weiboSpider/blob/master/docs/settings.md#设置数据库可选)部分。 + +## csv结果文件如下所示 + +*1669879400.csv* + +![](https://picture.cognize.me/cognize/github/weibospider/weibo_csv.png) + +## txt结果文件如下所示 + +*1669879400.txt* + +![](https://picture.cognize.me/cognize/github/weibospider/weibo_txt.png) + +json文件包含迪丽热巴的用户信息和上千条微博信息,内容较多。为了表达清晰,这里仅展示两条微博。 + +## json结果文件如下所示 + +*1669879400.json* + +```json { "user": { "id": "1669879400", @@ -76,13 +99,21 @@ json文件包含迪丽热巴的用户信息和上千条微博信息,内容较 ] } ``` -*1669879400.json*
-
-**下载的图片如下所示:** -![](https://picture.cognize.me/cognize/github/weibospider/img.png)*img文件夹*
-本次下载了793张图片,大小一共1.21GB,包括她原创微博中的图片和转发微博转发理由中的图片。图片名为yyyymmdd+微博id的形式,若某条微博存在多张图片,则图片名中还会包括它在微博图片中的序号。若某张图片因为网络等原因下载失败,程序则会以“weibo_id:pic_url”的形式将出错微博id和图片url写入同文件夹下的not_downloaded.txt里;
-
-**下载的视频如下所示:** -![](https://picture.cognize.me/cognize/github/weibospider/video.png)*video文件夹*
-本次下载了70个视频,是她原创微博中的视频,视频名为yyyymmdd+微博id的形式。其中有一个视频因为网络原因下载失败,程序将它的微博id和视频url以“weibo_id:video_url”的形式写到了同文件夹下的not_downloaded.txt里。
+ +## 下载的图片如下所示 + +*img文件夹* + +![](https://picture.cognize.me/cognize/github/weibospider/img.png) + +本次下载了793张图片,大小一共1.21GB,包括她原创微博中的图片和转发微博转发理由中的图片。图片名为yyyymmdd+微博id的形式,若某条微博存在多张图片,则图片名中还会包括它在微博图片中的序号。若某张图片因为网络等原因下载失败,程序则会以“weibo_id:pic_url”的形式将出错微博id和图片url写入同文件夹下的not_downloaded.txt里; + +## 下载的视频如下所示 + +*video文件夹* + +![](https://picture.cognize.me/cognize/github/weibospider/video.png) + +本次下载了70个视频,是她原创微博中的视频,视频名为yyyymmdd+微博id的形式。其中有一个视频因为网络原因下载失败,程序将它的微博id和视频url以“weibo_id:video_url”的形式写到了同文件夹下的not_downloaded.txt里。 + 因为我本地没有安装MySQL数据库和MongoDB数据库,所以暂时设置成不写入数据库。如果你想要将爬取结果写入数据库,只需要先安装数据库(MySQL或MongoDB),再安装对应包(pymysql或pymongo),然后将mysql_write或mongodb_write值设置为1即可。写入MySQL需要用户名、密码等配置信息,这些配置如何设置见[设置数据库](https://github.com/dataabc/weiboSpider/blob/master/docs/settings.md#设置数据库可选)部分。 diff --git a/docs/settings.md b/docs/settings.md index bdf48d50..9b89ae93 100644 --- a/docs/settings.md +++ b/docs/settings.md @@ -1,10 +1,14 @@ -## 程序设置 +# 程序设置 + **源码下载安装**的用户在weiboSpider目录下运行如下命令,**pip安装**的用户在任意有写权限的目录运行如下命令: + ```bash $ python3 -m weibo_spider ``` + 第一次运行会生成**config.json**文件,请打开**config.json**文件,你会看到如下内容: -``` + +```json { "user_id_list": ["1669879400"], "filter": 1, @@ -28,124 +32,188 @@ $ python3 -m weibo_spider "sqlite_config": "weibo.db" } ``` -下面讲解每个参数的含义与设置方法。
-**设置user_id_list**
+ +下面讲解每个参数的含义与设置方法。 + +## 设置user_id_list + user_id_list是我们要爬取的微博的id,可以是一个,也可以是多个,例如: -``` + +```json "user_id_list": ["1223178222", "1669879400", "1729370543"], ``` -上述代码代表我们要连续爬取user_id分别为“1223178222”、 “1669879400”、 “1729370543”的三个用户的微博,具体如何获取user_id见[如何获取user_id](https://github.com/dataabc/weiboSpider/blob/master/docs/userid.md)。
-user_id_list的值也可以是文件路径,我们可以把要爬的所有微博用户的user_id都写到txt文件里,然后把文件的位置路径赋值给user_id_list,**推荐这种方式**。
+ +上述代码代表我们要连续爬取user_id分别为“1223178222”、 “1669879400”、 “1729370543”的三个用户的微博,具体如何获取user_id见[如何获取user_id](https://github.com/dataabc/weiboSpider/blob/master/docs/userid.md)。 + +user_id_list的值也可以是文件路径,我们可以把要爬的所有微博用户的user_id都写到txt文件里,然后把文件的位置路径赋值给user_id_list,**推荐这种方式**。 + 在txt文件中,每个user_id占一行,也可以在user_id后面加注释(可选),如用户昵称等信息,user_id和注释之间必需要有空格,文件名任意,类型为txt,位置位于本程序的同目录下,文件内容示例如下: -``` + +```text 1223178222 胡歌 1669879400 迪丽热巴 1729370543 郭碧婷 ``` + 假如文件叫user_id_list.txt,则user_id_list设置代码为: -``` + +```json "user_id_list": "user_id_list.txt", ``` -**设置filter**
+ +## 设置filter + filter控制爬取范围,值为1代表爬取全部原创微博,值为0代表爬取全部微博(原创+转发)。例如,如果要爬全部原创微博,请使用如下代码: -``` + +```json "filter": 1, ``` -**设置since_date**
+ +## 设置since_date + since_date值可以是日期,也可以是整数。如果是日期,代表爬取该日期之后的微博,格式应为“yyyy-mm-dd”,如: -``` + +```json "since_date": "2018-01-01", ``` -代表爬取从2018年1月1日到现在的微博。
+ +代表爬取从2018年1月1日到现在的微博。 + 如果是整数,代表爬取最近n天的微博,如: -``` + +```json "since_date": 10, ``` -代表爬取最近10天的微博,这个说法不是特别准确,准确说是爬取发布时间从**10天前到本程序开始执行时**之间的微博。
-**since_date是所有user的爬取起始时间,非常不灵活。如果你要爬多个用户,并且想单独为每个用户设置一个since_date,可以使用[定期自动爬取微博](https://github.com/dataabc/weiboSpider/blob/master/docs/automation.md)方法二中的方法,该方法可以为多个用户设置不同的since_date,非常灵活。**
-**设置end_date**
-end_date值可以是日期,也可以是"now"。如果是日期,代表爬取该日期之前的微博,格式应为“yyyy-mm-dd”;如果是"now",代表爬取发布日期从since_date到现在的微博。since_date配合end_date,表示爬取发布日期在since_date和end_date之间的微博,包含边界。since_date是起始日期,end_date是结束日期,因此end_date时间应晚于since_date。注意,since_date即可以通过config.json文件的since_date参数设置,也可以通过user_id_list.txt设置;而end_date只能通过config.json文件的end_date参数设置,是全局变量,所有user_id都使用同一个end_date。
-**推荐使用"now"作为end_date值**,当值为"now"时,获取结果是正确和稳定的;当end_date值不是"now"时,在爬微博数非常多的账号时,程序可能不稳定,得到很多空微博页,并且此时无法获取微博中的视频,如果想要获取视频,请为end_date赋值为"now"。
-**设置random_wait_pages**
-random_wait_pages值是一个长度为2的整数列表,代表每爬取x页微博暂停一次,x为整数,值在random_wait_pages列表两个整数之间随机获取。默认值为[1, 5],代表每爬取1到5页暂停一次,如果程序被限制,可以加快暂停频率,即适当减小random_wait_pages内的值。
-**设置random_wait_seconds**
-random_wait_seconds值是一个长度为2的整数列表,代表每次暂停sleep x 秒,x为整数, 值在random_wait_seconds列表两个整数之间随机获取。默认值为[6, 10],代表每次暂停sleep 6到10秒,如果程序被限制,可以增加等待时间,即适当增大random_wait_seconds内的值。
-**设置global_wait**
-global_wait控制全局等待时间,默认值为[[1000, 3600], [500, 2000]],代表获取1000页微博,程序一次性暂停3600秒;之后获取500页微博,程序再一次性暂停2000秒;之后如果再获取1000页微博,程序一次性暂停3600秒,以此类推。默认的只有前面的两个全局等待时间([1000, 3600]和[500, 2000]),可以设置多个,如值可以为[[1000, 3600], [500, 3000], [700, 3600]],程序会根据配置依次等待对应时间,如果配置全部被使用,程序会从第一个配置开始,依次使用,循环往复。
-**设置write_mode**
+ +代表爬取最近10天的微博,这个说法不是特别准确,准确说是爬取发布时间从**10天前到本程序开始执行时**之间的微博。 + +**since_date是所有user的爬取起始时间,非常不灵活。如果你要爬多个用户,并且想单独为每个用户设置一个since_date,可以使用[定期自动爬取微博](https://github.com/dataabc/weiboSpider/blob/master/docs/automation.md)方法二中的方法,该方法可以为多个用户设置不同的since_date,非常灵活。** + +## 设置end_date + +end_date值可以是日期,也可以是"now"。如果是日期,代表爬取该日期之前的微博,格式应为“yyyy-mm-dd”;如果是"now",代表爬取发布日期从since_date到现在的微博。since_date配合end_date,表示爬取发布日期在since_date和end_date之间的微博,包含边界。since_date是起始日期,end_date是结束日期,因此end_date时间应晚于since_date。注意,since_date即可以通过config.json文件的since_date参数设置,也可以通过user_id_list.txt设置;而end_date只能通过config.json文件的end_date参数设置,是全局变量,所有user_id都使用同一个end_date。 + +**推荐使用"now"作为end_date值**,当值为"now"时,获取结果是正确和稳定的;当end_date值不是"now"时,在爬微博数非常多的账号时,程序可能不稳定,得到很多空微博页,并且此时无法获取微博中的视频,如果想要获取视频,请为end_date赋值为"now"。 + +## 设置random_wait_pages + +random_wait_pages值是一个长度为2的整数列表,代表每爬取x页微博暂停一次,x为整数,值在random_wait_pages列表两个整数之间随机获取。默认值为[1, 5],代表每爬取1到5页暂停一次,如果程序被限制,可以加快暂停频率,即适当减小random_wait_pages内的值。 + +## 设置random_wait_seconds + +random_wait_seconds值是一个长度为2的整数列表,代表每次暂停sleep x 秒,x为整数, 值在random_wait_seconds列表两个整数之间随机获取。默认值为[6, 10],代表每次暂停sleep 6到10秒,如果程序被限制,可以增加等待时间,即适当增大random_wait_seconds内的值。 + +## 设置global_wait + +global_wait控制全局等待时间,默认值为[[1000, 3600], [500, 2000]],代表获取1000页微博,程序一次性暂停3600秒;之后获取500页微博,程序再一次性暂停2000秒;之后如果再获取1000页微博,程序一次性暂停3600秒,以此类推。默认的只有前面的两个全局等待时间([1000, 3600]和[500, 2000]),可以设置多个,如值可以为[[1000, 3600], [500, 3000], [700, 3600]],程序会根据配置依次等待对应时间,如果配置全部被使用,程序会从第一个配置开始,依次使用,循环往复。 + +## 设置write_mode + write_mode控制结果文件格式,取值范围是csv、txt、json、mongo、mysql和sqlite,分别代表将结果文件写入csv、txt、json、MongoDB、MySQL和SQLite数据库。write_mode可以同时包含这些取值中的一个或几个,如: -``` + +```json "write_mode": ["csv", "txt"], ``` -代表将结果信息写入csv文件和txt文件。特别注意,如果你想写入数据库,除了在write_mode添加对应数据库的名字外,还应该安装相关数据库和对应python模块,具体操作见[设置数据库](https://github.com/dataabc/weiboSpider/blob/master/docs/settings.md#设置数据库可选)部分。
-**设置pic_download**
+ +代表将结果信息写入csv文件和txt文件。特别注意,如果你想写入数据库,除了在write_mode添加对应数据库的名字外,还应该安装相关数据库和对应python模块,具体操作见[设置数据库](https://github.com/dataabc/weiboSpider/blob/master/docs/settings.md#设置数据库可选)部分。 + +## 设置pic_download + pic_download控制是否下载微博中的图片,值为1代表下载,值为0代表不下载,如 -``` + +```json "pic_download": 1, ``` -代表下载微博中的图片。
-**设置video_download**
+ +代表下载微博中的图片。 + +## 设置video_download + video_download控制是否下载微博中的视频,值为1代表下载,值为0代表不下载,如 -``` + +```json "video_download": 1, ``` -代表下载微博中的视频。
-**设置result_dir_name**
+ +代表下载微博中的视频。 + +## 设置result_dir_name + result_dir_name控制结果目录的名字,可选值为0和1,默认值为0: -``` + +```json "result_dir_name": 0, ``` -值为0表示将结果文件保存在以用户昵称为名的文件夹里,这样结果更清晰;值为1表示将结果保存在以用户id为名的文件夹里,这样更能保证多次爬取的一致性,因为用户昵称可以改变,用户id是不变的。
-**设置cookie**
-请按照[如何获取cookie](https://github.com/dataabc/weiboSpider/blob/master/docs/cookie.md),获取cookie,然后将“your cookie”替换成真实的cookie值。
-**设置mysql_config(可选)**
-mysql_config控制mysql参数配置。如果你不需要将结果信息写入mysql,这个参数可以忽略,即删除或保留都无所谓;如果你需要写入mysql且config.json文件中mysql_config的配置与你的mysql配置不一样,请将该值改成你自己mysql中的参数配置。
-**设置sqlite_config(可选)**
+ +值为0表示将结果文件保存在以用户昵称为名的文件夹里,这样结果更清晰;值为1表示将结果保存在以用户id为名的文件夹里,这样更能保证多次爬取的一致性,因为用户昵称可以改变,用户id是不变的。 + +## 设置cookie + +请按照[如何获取cookie](https://github.com/dataabc/weiboSpider/blob/master/docs/cookie.md),获取cookie,然后将“your cookie”替换成真实的cookie值。 + +## 设置mysql_config(可选) + +mysql_config控制mysql参数配置。如果你不需要将结果信息写入mysql,这个参数可以忽略,即删除或保留都无所谓;如果你需要写入mysql且config.json文件中mysql_config的配置与你的mysql配置不一样,请将该值改成你自己mysql中的参数配置。 + +## 设置sqlite_config(可选) + sqlite_config控制SQLite参数配置,代表SQLite数据库的保存路径,可根据自己需求修改。 ## 设置数据库(可选) -本部分是可选部分,如果不需要将爬取信息写入数据库,可跳过这一步。本程序目前支持MySQL数据库和MongoDB数据库,如果你需要写入其它数据库,可以参考这两个数据库的写法自己编写。
-**MySQL数据库写入**
+ +本部分是可选部分,如果不需要将爬取信息写入数据库,可跳过这一步。本程序目前支持MySQL数据库和MongoDB数据库,如果你需要写入其它数据库,可以参考这两个数据库的写法自己编写。 + +## MySQL数据库写入 + 要想将爬取信息写入MySQL,请根据自己的系统环境安装MySQL,然后命令行执行: + ```bash $ pip install pymysql ``` -**MongoDB数据库写入**
+ +## MongoDB数据库写入 + 要想将爬取信息写入MongoDB,请根据自己的系统环境安装MongoDB,然后命令行执行: + ```bash $ pip install pymongo ``` + MySQL和MongDB数据库的写入内容一样。程序首先会创建一个名为"weibo"的数据库,然后再创建"user"表和"weibo"表,包含爬取的所有内容。爬取到的微博**用户信息**或插入或更新,都会存储到user表里;爬取到的**微博信息**或插入或更新,都会存储到weibo表里,两个表通过user_id关联。如果想了解两个表的具体字段,请点击"详情"。 +
+ 详情 - -**user表**
-**id**:存储用户id,如"1669879400";
-**nickname**:存储用户昵称,如"Dear-迪丽热巴";
-**gender**:存储用户性别;
-**location**:存储用户所在地;
-**birthday**:存储用户出生日期;
-**description**:存储用户简介;
-**verified_reason**:存储用户认证;
-**talent**:存储用户标签;
-**education**:存储用户学习经历;
-**work**:存储用户工作经历;
-**weibo_num**:存储微博数;
-**following**:存储关注数;
-**followers**:存储粉丝数。
+ +- **user表** +- **id**:存储用户id,如"1669879400"; +- **nickname**:存储用户昵称,如"Dear-迪丽热巴"; +- **gender**:存储用户性别; +- **location**:存储用户所在地; +- **birthday**:存储用户出生日期; +- **description**:存储用户简介; +- **verified_reason**:存储用户认证; +- **talent**:存储用户标签; +- **education**:存储用户学习经历; +- **work**:存储用户工作经历; +- **weibo_num**:存储微博数; +- **following**:存储关注数; +- **followers**:存储粉丝数。 + *** -**weibo表**
-**id**:存储微博id;
-**user_id**:存储微博发布者的用户id,如"1669879400";
-**content**:存储微博正文;
-**article_url**:存储微博中头条文章的url,若微博中不存在头条文章,则值为'';
-**original_pictures**:存储原创微博的原始图片url和转发微博转发理由中的图片url。若某条微博有多张图片,则存储多个url,以英文逗号分割;若某微博没有图片,则值为"无";
-**retweet_pictures**:存储被转发微博中的原始图片url。当最新微博为原创微博或者为没有图片的转发微博时,则值为"无",否则为被转发微博的图片url。若有多张图片,则存储多个url,以英文逗号分割;
-**publish_place**:存储微博的发布位置。如果某条微博没有位置信息,则值为"无";
-**publish_time**:存储微博的发布时间;
-**up_num**:存储微博获得的点赞数;
-**retweet_num**:存储微博获得的转发数;
-**comment_num**:存储微博获得的评论数;
-**publish_tool**:存储微博的发布工具。 + +- **weibo表** +- **id**:存储微博id; +- **user_id**:存储微博发布者的用户id,如"1669879400"; +- **content**:存储微博正文; +- **article_url**:存储微博中头条文章的url,若微博中不存在头条文章,则值为''; +- **original_pictures**:存储原创微博的原始图片url和转发微博转发理由中的图片url。若某条微博有多张图片,则存储多个url,以英文逗号分割;若某微博没有图片,则值为"无"; +- **retweet_pictures**:存储被转发微博中的原始图片url。当最新微博为原创微博或者为没有图片的转发微博时,则值为"无",否则为被转发微博的图片url。若有多张图片,则存储多个url,以英文逗号分割; +- **publish_place**:存储微博的发布位置。如果某条微博没有位置信息,则值为"无"; +- **publish_time**:存储微博的发布时间; +- **up_num**:存储微博获得的点赞数; +- **retweet_num**:存储微博获得的转发数; +- **comment_num**:存储微博获得的评论数; +- **publish_tool**:存储微博的发布工具。
diff --git a/docs/userid.md b/docs/userid.md index 7fc2a2c4..68d2e595 100644 --- a/docs/userid.md +++ b/docs/userid.md @@ -1,11 +1,16 @@ ## 如何获取user_id -1.打开网址,搜索我们要找的人,如"迪丽热巴",进入她的主页;
-![](https://picture.cognize.me/cognize/github/weibospider/user_home.png) -2.按照上图箭头所指,点击"资料"链接,跳转到用户资料页面;
-![](https://picture.cognize.me/cognize/github/weibospider/user_info.png) -如上图所示,迪丽热巴微博资料页的地址为"",其中的"1669879400"即为此微博的user_id。
-事实上,此微博的user_id也包含在用户主页()中,之所以我们还要点击主页中的"资料"来获取user_id,是因为很多用户的主页不是""的形式,而是""或""的形式。其中"微号"和user_id都是一串数字,如果仅仅通过主页地址提取user_id,很容易将"微号"误认为user_id。
-上述可以获得一个user_id,如果想要获得**大量**微博,见[如何获取大量user_id](#如何获取大量user_id)部分。
+ +1. 打开网址,搜索我们要找的人,如"迪丽热巴",进入她的主页; + ![user home](https://picture.cognize.me/cognize/github/weibospider/user_home.png) +2. 按照上图箭头所指,点击"资料"链接,跳转到用户资料页面; + ![user info](https://picture.cognize.me/cognize/github/weibospider/user_info.png) + +如上图所示,迪丽热巴微博资料页的地址为"",其中的"1669879400"即为此微博的user_id。 + +事实上,此微博的user_id也包含在用户主页()中,之所以我们还要点击主页中的"资料"来获取user_id,是因为很多用户的主页不是""的形式,而是""或""的形式。其中"微号"和user_id都是一串数字,如果仅仅通过主页地址提取user_id,很容易将"微号"误认为user_id。 + +上述可以获得一个user_id,如果想要获得**大量**微博,见[如何获取大量user_id](#如何获取大量user_id)部分。 ## 如何获取大量user_id -[如何获取user_id](#如何获取user_id)部分可以获得一个user_id,可以利用这一个user_id,获取该user_id微博用户关注人的user_id,一个user_id最多可以获得200个user_id,并写入user_id_list.txt文件。程序支持读文件,利用这200个user_id,可以获得最多200X200=40000个user_id。再利用这40000个user_id可以得到40000X200=8000000个user_id,如此反复,以此类推,可以获得大量user_id。本项目也支持读文件,将上述程序的结果文件user_id_list.txt路径赋值给本项目config.json的user_id_list参数,就可以获得这些user_id用户所发布的大量微博。
\ No newline at end of file + +[如何获取user_id](#如何获取user_id)部分可以获得一个user_id,可以利用这一个user_id,获取该user_id微博用户关注人的user_id,一个user_id最多可以获得200个user_id,并写入user_id_list.txt文件。程序支持读文件,利用这200个user_id,可以获得最多200X200=40000个user_id。再利用这40000个user_id可以得到40000X200=8000000个user_id,如此反复,以此类推,可以获得大量user_id。本项目也支持读文件,将上述程序的结果文件user_id_list.txt路径赋值给本项目config.json的user_id_list参数,就可以获得这些user_id用户所发布的大量微博。 From 7f35979a77d5aae74ecd13ba16bcdc61370c21fc Mon Sep 17 00:00:00 2001 From: mtuwei <32591958+mtuwei@users.noreply.github.com> Date: Sun, 1 Aug 2021 22:06:27 +0800 Subject: [PATCH 329/363] Update README.md --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 2cbbb543..8c4a6404 100644 --- a/README.md +++ b/README.md @@ -31,6 +31,8 @@ - [Weibo Spider](#weibo-spider) - [内容列表](#内容列表) - [获取到的字段](#获取到的字段) + - [用户信息](#用户信息) + - [微博信息](#微博信息) - [示例](#示例) - [运行环境](#运行环境) - [使用说明](#使用说明) @@ -224,7 +226,7 @@ $ python3 -m weibo_spider --u="1669879400,1223178222" ## 定期自动爬取微博(可选) -要想让程序每个一段时间自动爬取,且爬取的内容为新增加的内容(不包括已经获取的微博),请查看[定期自动爬取微博](https://github.com/dataabc/weiboSpider/blob/master/docs/automation.md)。 +要想让程序每隔一段时间自动爬取,且爬取的内容为新增加的内容(不包括已经获取的微博),请查看[定期自动爬取微博](https://github.com/dataabc/weiboSpider/blob/master/docs/automation.md)。 ## 如何获取cookie From 7698890bea80671f3050cb11ebcbff3ea954f37c Mon Sep 17 00:00:00 2001 From: Zhengyang Song Date: Wed, 18 Aug 2021 22:41:04 +0800 Subject: [PATCH 330/363] Add photo parser and unit test. 
--- tests/test_parser/test_photo_parser.py | 14 ++++++++++++++ ...14abc1d52605fc00d91279df9ac4c1465c85b91b3.html | 1 + tests/testdata/url_map.json | 3 ++- weibo_spider/parser/photo_parser.py | 15 +++++++++++++++ 4 files changed, 32 insertions(+), 1 deletion(-) create mode 100644 tests/test_parser/test_photo_parser.py create mode 100644 tests/testdata/e4d541ecb02253c14abc1d52605fc00d91279df9ac4c1465c85b91b3.html create mode 100644 weibo_spider/parser/photo_parser.py diff --git a/tests/test_parser/test_photo_parser.py b/tests/test_parser/test_photo_parser.py new file mode 100644 index 00000000..63cebe9b --- /dev/null +++ b/tests/test_parser/test_photo_parser.py @@ -0,0 +1,14 @@ +from unittest.mock import patch + +from weibo_spider.parser.photo_parser import PhotoParser + +from .util import mock_request_get_content + + +@patch('requests.get', mock_request_get_content) +def test_photo_parser(): + photo_parser = PhotoParser(cookie="", user_id=1980768563) + + avatar_album_url = photo_parser.extract_avatar_album_url() + assert (avatar_album_url == + "https://weibo.cn/album/166564740000001980768563?rl=1") diff --git a/tests/testdata/e4d541ecb02253c14abc1d52605fc00d91279df9ac4c1465c85b91b3.html b/tests/testdata/e4d541ecb02253c14abc1d52605fc00d91279df9ac4c1465c85b91b3.html new file mode 100644 index 00000000..161b4305 --- /dev/null +++ b/tests/testdata/e4d541ecb02253c14abc1d52605fc00d91279df9ac4c1465c85b91b3.html @@ -0,0 +1 @@ +微博
霜叶的相册
 微博  相册 
 
TOP
\ No newline at end of file diff --git a/tests/testdata/url_map.json b/tests/testdata/url_map.json index 35758358..2aca0433 100644 --- a/tests/testdata/url_map.json +++ b/tests/testdata/url_map.json @@ -7,5 +7,6 @@ "https://weibo.cn/1669879400?page=2": "tests/testdata/2f62165fa3ca1e85e0d398d385c377a068b76eb95765f7020ffffd3e.html", "https://weibo.cn/1669879400?page=3": "tests/testdata/d486235d4a17dd0accb0f2cc77b3648abfa03580b9e0cdb61f1e618f.html", "https://weibo.cn/mblog/picAll/J3xfm61AZ?rl=1": "tests/testdata/76233b3f90394581aac6f19cfa5d674a610e8b442b1f83de7673ab49.html", - "https://weibo.cn/comment/J5cVGuUNq": "tests/testdata/4d5ed0a3ebd0303cb45edd544dbc0ab5e86d43e103405f0c60515884.html" + "https://weibo.cn/comment/J5cVGuUNq": "tests/testdata/4d5ed0a3ebd0303cb45edd544dbc0ab5e86d43e103405f0c60515884.html", + "https://weibo.cn/1980768563/photo?tf=6_008": "tests/testdata/e4d541ecb02253c14abc1d52605fc00d91279df9ac4c1465c85b91b3.html" } diff --git a/weibo_spider/parser/photo_parser.py b/weibo_spider/parser/photo_parser.py new file mode 100644 index 00000000..33551c7e --- /dev/null +++ b/weibo_spider/parser/photo_parser.py @@ -0,0 +1,15 @@ +from .util import handle_html +from .parser import Parser + + +class PhotoParser(Parser): + def __init__(self, cookie, user_id): + self.cookie = cookie + self.url = "https://weibo.cn/" + str(user_id) + "/photo?tf=6_008" + self.selector = handle_html(self.cookie, self.url) + + def extract_avatar_album_url(self): + # Finds the href attribute of the table td div element with text 头像相册, e.g. + # 头像相册 + result = self.selector.xpath('//img[@alt="头像相册"]/../@href') + return "https://weibo.cn" + result[0] From 1884a8d3adb487fd4453754dffc9a0ddc89a14b1 Mon Sep 17 00:00:00 2001 From: Zhengyang Song Date: Wed, 18 Aug 2021 22:56:16 +0800 Subject: [PATCH 331/363] Add album parser and unit test. 
--- tests/test_parser/test_album_parser.py | 20 +++++++++++++++++++ ...6f40d3321686ddf871651237c4ac854a5c3eb.html | 1 + tests/testdata/url_map.json | 3 ++- weibo_spider/parser/album_parser.py | 13 ++++++++++++ 4 files changed, 36 insertions(+), 1 deletion(-) create mode 100644 tests/test_parser/test_album_parser.py create mode 100644 tests/testdata/b541fd1751117498b6d6f40d3321686ddf871651237c4ac854a5c3eb.html create mode 100644 weibo_spider/parser/album_parser.py diff --git a/tests/test_parser/test_album_parser.py b/tests/test_parser/test_album_parser.py new file mode 100644 index 00000000..361949e4 --- /dev/null +++ b/tests/test_parser/test_album_parser.py @@ -0,0 +1,20 @@ +from unittest.mock import patch + +from .util import mock_request_get_content +from weibo_spider.parser.album_parser import AlbumParser + + +@patch('requests.get', mock_request_get_content) +def test_album_parser(): + album_parser = AlbumParser( + cookie="", + album_url="https://weibo.cn/album/166564740000001980768563?rl=1") + + pic_urls = album_parser.extract_pic_urls() + assert (len(pic_urls) == 4) + assert (pic_urls == [ + 'http://wx1.sinaimg.cn/wap180/76102133ly8ga961tpte6j20u00u0q65.jpg', + 'http://wx2.sinaimg.cn/wap180/76102133ly8fwr33wpn8fj20v90v9tbw.jpg', + 'http://wx4.sinaimg.cn/wap180/76102133ly8fvlyn5n52gj20v90v949a.jpg', + 'http://wx2.sinaimg.cn/wap180/76102133ly8fk0btnrn5zj20dp0e8q3t.jpg' + ]) diff --git a/tests/testdata/b541fd1751117498b6d6f40d3321686ddf871651237c4ac854a5c3eb.html b/tests/testdata/b541fd1751117498b6d6f40d3321686ddf871651237c4ac854a5c3eb.html new file mode 100644 index 00000000..30936fb8 --- /dev/null +++ b/tests/testdata/b541fd1751117498b6d6f40d3321686ddf871651237c4ac854a5c3eb.html @@ -0,0 +1 @@ +专辑:头像相册
专辑:头像相册
照片墙|传统列表
TOP
\ No newline at end of file diff --git a/tests/testdata/url_map.json b/tests/testdata/url_map.json index 2aca0433..22be7207 100644 --- a/tests/testdata/url_map.json +++ b/tests/testdata/url_map.json @@ -8,5 +8,6 @@ "https://weibo.cn/1669879400?page=3": "tests/testdata/d486235d4a17dd0accb0f2cc77b3648abfa03580b9e0cdb61f1e618f.html", "https://weibo.cn/mblog/picAll/J3xfm61AZ?rl=1": "tests/testdata/76233b3f90394581aac6f19cfa5d674a610e8b442b1f83de7673ab49.html", "https://weibo.cn/comment/J5cVGuUNq": "tests/testdata/4d5ed0a3ebd0303cb45edd544dbc0ab5e86d43e103405f0c60515884.html", - "https://weibo.cn/1980768563/photo?tf=6_008": "tests/testdata/e4d541ecb02253c14abc1d52605fc00d91279df9ac4c1465c85b91b3.html" + "https://weibo.cn/1980768563/photo?tf=6_008": "tests/testdata/e4d541ecb02253c14abc1d52605fc00d91279df9ac4c1465c85b91b3.html", + "https://weibo.cn/album/166564740000001980768563?rl=1": "tests/testdata/b541fd1751117498b6d6f40d3321686ddf871651237c4ac854a5c3eb.html" } diff --git a/weibo_spider/parser/album_parser.py b/weibo_spider/parser/album_parser.py new file mode 100644 index 00000000..c187b9df --- /dev/null +++ b/weibo_spider/parser/album_parser.py @@ -0,0 +1,13 @@ +from .util import handle_html +from .parser import Parser + + +class AlbumParser(Parser): + def __init__(self, cookie, album_url): + self.cookie = cookie + self.url = album_url + self.selector = handle_html(self.cookie, self.url) + + def extract_pic_urls(self): + # + return self.selector.xpath('//img[@class="c"]/@src') \ No newline at end of file From 52cbbafb6d99296dfc6b3a0de05e2ff6918712e3 Mon Sep 17 00:00:00 2001 From: Zhengyang Song Date: Thu, 19 Aug 2021 21:33:25 +0800 Subject: [PATCH 332/363] Feature: download avatar pictures. 
--- weibo_spider/downloader/__init__.py | 6 ++++- .../downloader/avatar_picture_downloader.py | 22 +++++++++++++++++++ weibo_spider/parser/__init__.py | 4 +++- weibo_spider/spider.py | 17 +++++++++++++- 4 files changed, 46 insertions(+), 3 deletions(-) create mode 100644 weibo_spider/downloader/avatar_picture_downloader.py diff --git a/weibo_spider/downloader/__init__.py b/weibo_spider/downloader/__init__.py index d573f9f6..53e9dfdf 100644 --- a/weibo_spider/downloader/__init__.py +++ b/weibo_spider/downloader/__init__.py @@ -1,5 +1,9 @@ from .origin_picture_downloader import OriginPictureDownloader from .retweet_picture_downloader import RetweetPictureDownloader +from .avatar_picture_downloader import AvatarPictureDownloader from .video_downloader import VideoDownloader -__all__ = [OriginPictureDownloader, RetweetPictureDownloader, VideoDownloader] +__all__ = [ + OriginPictureDownloader, RetweetPictureDownloader, AvatarPictureDownloader, + VideoDownloader +] diff --git a/weibo_spider/downloader/avatar_picture_downloader.py b/weibo_spider/downloader/avatar_picture_downloader.py new file mode 100644 index 00000000..e7a64e82 --- /dev/null +++ b/weibo_spider/downloader/avatar_picture_downloader.py @@ -0,0 +1,22 @@ +import os + +from .img_downloader import ImgDownloader + + +class AvatarPictureDownloader(ImgDownloader): + def __init__(self, file_dir, file_download_timeout): + super().__init__(file_dir, file_download_timeout) + self.describe = u'头像图片' + self.key = 'avatar_pictures' + + def handle_download(self, urls): + """处理下载相关操作""" + file_dir = self.file_dir + os.sep + self.describe + if not os.path.isdir(file_dir): + os.makedirs(file_dir) + + for i, url in enumerate(urls): + index = url.rfind('/') + file_name = url[index:] + file_path = file_dir + os.sep + file_name + self.download_one_file(url, file_path, 'xxx') \ No newline at end of file diff --git a/weibo_spider/parser/__init__.py b/weibo_spider/parser/__init__.py index 46f81f0e..27af1f76 100644 --- 
a/weibo_spider/parser/__init__.py +++ b/weibo_spider/parser/__init__.py @@ -1,4 +1,6 @@ from .index_parser import IndexParser from .page_parser import PageParser +from .photo_parser import PhotoParser +from .album_parser import AlbumParser -__all__ = [IndexParser, PageParser] +__all__ = [IndexParser, PageParser, PhotoParser, AlbumParser] diff --git a/weibo_spider/spider.py b/weibo_spider/spider.py index 974ee998..85733c54 100644 --- a/weibo_spider/spider.py +++ b/weibo_spider/spider.py @@ -15,7 +15,8 @@ from tqdm import tqdm from . import config_util, datetime_util -from .parser import IndexParser, PageParser +from .downloader import AvatarPictureDownloader +from .parser import IndexParser, PageParser, PhotoParser, AlbumParser from .user import User FLAGS = flags.FLAGS @@ -140,6 +141,16 @@ def get_user_info(self, user_uri): self.user = IndexParser(self.cookie, user_uri).get_user() self.page_count += 1 + def download_user_avatar(self, user_uri): + """下载用户头像""" + avatar_album_url = PhotoParser(self.cookie, + user_uri).extract_avatar_album_url() + pic_urls = AlbumParser(self.cookie, + avatar_album_url).extract_pic_urls() + AvatarPictureDownloader( + self._get_filepath('img'), + self.file_download_timeout).handle_download(pic_urls) + def get_weibo_info(self): """获取微博信息""" try: @@ -302,6 +313,10 @@ def get_one_user(self, user_config): self.write_user(self.user) logger.info('*' * 100) + # 下载用户头像相册中的图片。 + if self.pic_download: + self.download_user_avatar(user_config['user_uri']) + for weibos in self.get_weibo_info(): self.write_weibo(weibos) self.got_num += len(weibos) From 5d6fc80f9ea4bc7cb68bf00fbffb7b602889d1c0 Mon Sep 17 00:00:00 2001 From: dataabc Date: Thu, 2 Sep 2021 00:52:06 +0800 Subject: [PATCH 333/363] =?UTF-8?q?fix:=20=E4=BF=AE=E5=A4=8D=E9=83=A8?= =?UTF-8?q?=E5=88=86=E5=BE=AE=E5=8D=9A=E7=BC=BA=E5=A4=B1=E7=9A=84=E9=97=AE?= =?UTF-8?q?=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Issue #374 --- setup.py | 2 
+- weibo_spider/parser/page_parser.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index ba026a42..498354f8 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setuptools.setup( name='weibo-spider', - version='0.2.5', + version='0.2.6', author='Chen Lei', author_email='chillychen1991@gmail.com', description='新浪微博爬虫,用python爬取新浪微博数据。', diff --git a/weibo_spider/parser/page_parser.py b/weibo_spider/parser/page_parser.py index e05335ef..d57d8565 100644 --- a/weibo_spider/parser/page_parser.py +++ b/weibo_spider/parser/page_parser.py @@ -62,7 +62,7 @@ def get_one_page(self, weibo_id_list): weibos = [] if is_exist: since_date = datetime_util.str_to_time(self.since_date) - for i in range(0, len(info) - 2): + for i in range(0, len(info) - 1): weibo = self.get_one_weibo(info[i]) if weibo: if weibo.id in weibo_id_list: From fc4cdf907994b2dd38291549fae518bada5def39 Mon Sep 17 00:00:00 2001 From: dataabc Date: Wed, 13 Oct 2021 19:16:27 +0800 Subject: [PATCH 334/363] =?UTF-8?q?fix:=20=E4=BF=AE=E5=A4=8D=E9=83=A8?= =?UTF-8?q?=E5=88=86=E6=95=B0=E5=AD=97=E8=BD=AC=E6=8D=A2=E9=94=99=E8=AF=AF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Issue #387 --- setup.py | 2 +- weibo_spider/parser/index_parser.py | 8 +++---- weibo_spider/parser/util.py | 33 +++++++++++++++++------------ weibo_spider/spider.py | 2 +- 4 files changed, 26 insertions(+), 19 deletions(-) diff --git a/setup.py b/setup.py index 498354f8..631503b2 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setuptools.setup( name='weibo-spider', - version='0.2.6', + version='0.2.7', author='Chen Lei', author_email='chillychen1991@gmail.com', description='新浪微博爬虫,用python爬取新浪微博数据。', diff --git a/weibo_spider/parser/index_parser.py b/weibo_spider/parser/index_parser.py index 2a82bb1a..caab1286 100644 --- a/weibo_spider/parser/index_parser.py +++ b/weibo_spider/parser/index_parser.py @@ -2,7 +2,7 @@ from .info_parser import InfoParser from 
.parser import Parser -from .util import handle_html +from .util import handle_html, string_to_int logger = logging.getLogger('spider.index_parser') @@ -36,9 +36,9 @@ def get_user(self): self.user.id = user_id user_info = self.selector.xpath("//div[@class='tip2']/*/text()") - self.user.weibo_num = int(user_info[0][3:-1]) - self.user.following = int(user_info[1][3:-1]) - self.user.followers = int(user_info[2][3:-1]) + self.user.weibo_num = string_to_int(user_info[0][3:-1]) + self.user.following = string_to_int(user_info[1][3:-1]) + self.user.followers = string_to_int(user_info[2][3:-1]) return self.user except Exception as e: logger.exception(e) diff --git a/weibo_spider/parser/util.py b/weibo_spider/parser/util.py index a763e96a..8e03f6f2 100644 --- a/weibo_spider/parser/util.py +++ b/weibo_spider/parser/util.py @@ -1,7 +1,7 @@ import hashlib +import json import logging import sys -import json import requests from lxml import etree @@ -87,24 +87,31 @@ def to_video_download_url(cookie, video_page_url): if video_page_url == '': return '' - video_object_url = video_page_url.replace( - 'm.weibo.cn/s/video/show', 'm.weibo.cn/s/video/object') + video_object_url = video_page_url.replace('m.weibo.cn/s/video/show', + 'm.weibo.cn/s/video/object') try: user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36' - headers = { - 'User_Agent': user_agent, - 'Cookie': cookie - } - wb_info = requests.get(video_object_url, - headers=headers).json() - video_url = wb_info['data']['object']['stream'].get( - 'hd_url') + headers = {'User_Agent': user_agent, 'Cookie': cookie} + wb_info = requests.get(video_object_url, headers=headers).json() + video_url = wb_info['data']['object']['stream'].get('hd_url') if not video_url: - video_url = wb_info['data']['object']['stream'][ - 'url'] + video_url = wb_info['data']['object']['stream']['url'] if not video_url: # 说明该视频为直播 video_url = '' except json.decoder.JSONDecodeError: 
logger.warning(u'当前账号没有浏览该视频的权限') return video_url + + +def string_to_int(string): + """字符串转换为整数""" + if isinstance(string, int): + return string + elif string.endswith(u'万+'): + string = string[:-2] + '0000' + elif string.endswith(u'万'): + string = float(string[:-1]) * 10000 + elif string.endswith(u'亿'): + string = float(string[:-1]) * 100000000 + return int(string) diff --git a/weibo_spider/spider.py b/weibo_spider/spider.py index 85733c54..3ac60b35 100644 --- a/weibo_spider/spider.py +++ b/weibo_spider/spider.py @@ -16,7 +16,7 @@ from . import config_util, datetime_util from .downloader import AvatarPictureDownloader -from .parser import IndexParser, PageParser, PhotoParser, AlbumParser +from .parser import AlbumParser, IndexParser, PageParser, PhotoParser from .user import User FLAGS = flags.FLAGS From d1080ce3772727ef027470150621f63151021c62 Mon Sep 17 00:00:00 2001 From: dataabc Date: Mon, 1 Nov 2021 19:01:13 +0800 Subject: [PATCH 335/363] =?UTF-8?q?fix:=20=E4=BF=AE=E5=A4=8D=E9=83=A8?= =?UTF-8?q?=E5=88=86=E5=BE=AE=E5=8D=9A=E6=97=A0=E6=B3=95=E8=8E=B7=E5=8F=96?= =?UTF-8?q?=E7=9A=84=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Issue #374 --- setup.py | 2 +- weibo_spider/parser/album_parser.py | 9 +++++++-- weibo_spider/parser/photo_parser.py | 8 ++++++-- 3 files changed, 14 insertions(+), 5 deletions(-) diff --git a/setup.py b/setup.py index 631503b2..e76d2d0b 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setuptools.setup( name='weibo-spider', - version='0.2.7', + version='0.2.8', author='Chen Lei', author_email='chillychen1991@gmail.com', description='新浪微博爬虫,用python爬取新浪微博数据。', diff --git a/weibo_spider/parser/album_parser.py b/weibo_spider/parser/album_parser.py index c187b9df..546bb672 100644 --- a/weibo_spider/parser/album_parser.py +++ b/weibo_spider/parser/album_parser.py @@ -1,5 +1,5 @@ -from .util import handle_html from .parser import Parser +from .util import handle_html class 
AlbumParser(Parser): @@ -10,4 +10,9 @@ def __init__(self, cookie, album_url): def extract_pic_urls(self): # - return self.selector.xpath('//img[@class="c"]/@src') \ No newline at end of file + pic_list = self.selector.xpath('//div[@class="c"]//img/@src') + for i, pic in enumerate(pic_list): + if "?" in pic: + pic = pic[:pic.index("?")] + pic_list[i] = pic + return pic_list diff --git a/weibo_spider/parser/photo_parser.py b/weibo_spider/parser/photo_parser.py index 33551c7e..236e76e2 100644 --- a/weibo_spider/parser/photo_parser.py +++ b/weibo_spider/parser/photo_parser.py @@ -1,5 +1,5 @@ -from .util import handle_html from .parser import Parser +from .util import handle_html class PhotoParser(Parser): @@ -7,9 +7,13 @@ def __init__(self, cookie, user_id): self.cookie = cookie self.url = "https://weibo.cn/" + str(user_id) + "/photo?tf=6_008" self.selector = handle_html(self.cookie, self.url) + self.user_id = user_id def extract_avatar_album_url(self): # Finds the href attribute of the table td div element with text 头像相册, e.g. 
# 头像相册 result = self.selector.xpath('//img[@alt="头像相册"]/../@href') - return "https://weibo.cn" + result[0] + if len(result) > 0: + return "https://weibo.cn" + result[0] + else: + return "https://weibo.cn/" + str(self.user_id) + "/avatar?rl=0" From c106ffa1b8b51e4ecd408b661d092da07896a94b Mon Sep 17 00:00:00 2001 From: minami9 Date: Mon, 13 Dec 2021 16:41:00 +0800 Subject: [PATCH 336/363] fix: uid=1,crash --- weibo_spider/parser/info_parser.py | 9 ++++----- weibo_spider/parser/util.py | 2 ++ 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/weibo_spider/parser/info_parser.py b/weibo_spider/parser/info_parser.py index 6928c7c2..90164554 100644 --- a/weibo_spider/parser/info_parser.py +++ b/weibo_spider/parser/info_parser.py @@ -35,9 +35,9 @@ def extract_user_info(self): if i.split(':', 1)[0] in zh_list: setattr(user, en_list[zh_list.index(i.split(':', 1)[0])], i.split(':', 1)[1].replace('\u3000', '')) - - if self.selector.xpath( - "//div[@class='tip'][2]/text()")[0] == u'学习经历': + + experienced = self.selector.xpath("//div[@class='tip'][2]/text()") + if experienced and experienced[0] == u'学习经历': user.education = self.selector.xpath( "//div[@class='c'][4]/text()")[0][1:].replace( u'\xa0', u' ') @@ -46,8 +46,7 @@ def extract_user_info(self): user.work = self.selector.xpath( "//div[@class='c'][5]/text()")[0][1:].replace( u'\xa0', u' ') - elif self.selector.xpath( - "//div[@class='tip'][2]/text()")[0] == u'工作经历': + elif experienced and experienced[0] == u'工作经历': user.work = self.selector.xpath( "//div[@class='c'][4]/text()")[0][1:].replace( u'\xa0', u' ') diff --git a/weibo_spider/parser/util.py b/weibo_spider/parser/util.py index 8e03f6f2..6ce6be70 100644 --- a/weibo_spider/parser/util.py +++ b/weibo_spider/parser/util.py @@ -106,6 +106,8 @@ def to_video_download_url(cookie, video_page_url): def string_to_int(string): """字符串转换为整数""" + if len(string) == 0: + return 0 if isinstance(string, int): return string elif string.endswith(u'万+'): From 
d2589ddf2860d83183f246495100323eba25c870 Mon Sep 17 00:00:00 2001 From: minami9 Date: Mon, 13 Dec 2021 16:52:22 +0800 Subject: [PATCH 337/363] add: warning when stringtoint get a empty string --- weibo_spider/parser/util.py | 1 + 1 file changed, 1 insertion(+) diff --git a/weibo_spider/parser/util.py b/weibo_spider/parser/util.py index 6ce6be70..82e0f10c 100644 --- a/weibo_spider/parser/util.py +++ b/weibo_spider/parser/util.py @@ -107,6 +107,7 @@ def to_video_download_url(cookie, video_page_url): def string_to_int(string): """字符串转换为整数""" if len(string) == 0: + print("Warning: the input string is empty!") return 0 if isinstance(string, int): return string From 6db1a2c13f9ac9a9e1f94ce43415f43d8bb19bcc Mon Sep 17 00:00:00 2001 From: minami9 Date: Tue, 14 Dec 2021 00:19:08 +0800 Subject: [PATCH 338/363] add: use logger replace print in function string_to_int --- weibo_spider/parser/util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/weibo_spider/parser/util.py b/weibo_spider/parser/util.py index 82e0f10c..3169f24e 100644 --- a/weibo_spider/parser/util.py +++ b/weibo_spider/parser/util.py @@ -107,7 +107,7 @@ def to_video_download_url(cookie, video_page_url): def string_to_int(string): """字符串转换为整数""" if len(string) == 0: - print("Warning: the input string is empty!") + logger.warning("string to int, the input string is empty!") return 0 if isinstance(string, int): return string From 1061de0a64df53e74bc06dd3df36b3620e431cc7 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 13 Dec 2021 20:06:15 +0000 Subject: [PATCH 339/363] build(deps): bump lxml from 4.6.3 to 4.6.5 Bumps [lxml](https://github.com/lxml/lxml) from 4.6.3 to 4.6.5. 
- [Release notes](https://github.com/lxml/lxml/releases) - [Changelog](https://github.com/lxml/lxml/blob/master/CHANGES.txt) - [Commits](https://github.com/lxml/lxml/compare/lxml-4.6.3...lxml-4.6.5) --- updated-dependencies: - dependency-name: lxml dependency-type: direct:production ... Signed-off-by: dependabot[bot] --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 59c9e88f..e1e0117a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -lxml==4.6.3 +lxml==4.6.5 requests==2.23.0 tqdm==4.46.1 absl-py==0.9.0 \ No newline at end of file From 77977c22367da0c54bef61a28f9bf1aaf17f4108 Mon Sep 17 00:00:00 2001 From: Zhengyang Song Date: Sat, 19 Mar 2022 22:46:04 +0800 Subject: [PATCH 340/363] Add a bibtex item for the weiboSpider repo. --- README.md | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/README.md b/README.md index 8c4a6404..653ce8aa 100644 --- a/README.md +++ b/README.md @@ -244,6 +244,17 @@ $ python3 -m weibo_spider --u="1669879400,1223178222" 本项目通过获取微博数据,为写论文、做研究等非商业项目提供所需数据。[学术研究文档](https://github.com/dataabc/weiboSpider/blob/master/docs/academic.md)是一些在论文或研究等方面使用过本程序的项目,这些项目展示已征得所有者同意。在一些涉及隐私的描述上,已与所有者做了沟通,描述中只介绍所有者允许展示的部分。如果部分信息所有者之前同意展示并且已经写在了文档中,现在又不想展示了,可以通过邮件(chillychen1991@gmail.com)或issue的方式告诉我,我会删除相关信息。同时,也欢迎使用本项目写论文或做其它学术研究的朋友,将自己的研究成果展示在[学术研究文档](https://github.com/dataabc/weiboSpider/blob/master/docs/academic.md)里,这完全是自愿的。 +为方便大家引用,现提供该项目的 bibtex 条目如下: + +``` +@misc{weibospider2020, + author = {CHEN Lei, SONG Zhengyang, schaepher, minami9, bluerthanever, MKSP2015, moqimoqidea, windlively, eggachecat, mtuwei, codermino, duangan1}, + title = {{Weibo Spider}}, + howpublished = {\url{https://github.com/dataabc/weiboSpider}}, + year = {2020} +} +``` + ## 相关项目 - [weibo-crawler](https://github.com/dataabc/weibo-crawler) - 功能和本项目完全一样,可以不添加cookie,获取的微博属性更多; From 240cdef1dac8a937a09472fbb6cdc6c141f68f1d Mon Sep 17 00:00:00 2001 From: Zhengyang Song 
Date: Sun, 20 Mar 2022 00:51:14 +0800 Subject: [PATCH 341/363] Minor fixes of README based on review comments. --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 653ce8aa..7da707fa 100644 --- a/README.md +++ b/README.md @@ -244,11 +244,11 @@ $ python3 -m weibo_spider --u="1669879400,1223178222" 本项目通过获取微博数据,为写论文、做研究等非商业项目提供所需数据。[学术研究文档](https://github.com/dataabc/weiboSpider/blob/master/docs/academic.md)是一些在论文或研究等方面使用过本程序的项目,这些项目展示已征得所有者同意。在一些涉及隐私的描述上,已与所有者做了沟通,描述中只介绍所有者允许展示的部分。如果部分信息所有者之前同意展示并且已经写在了文档中,现在又不想展示了,可以通过邮件(chillychen1991@gmail.com)或issue的方式告诉我,我会删除相关信息。同时,也欢迎使用本项目写论文或做其它学术研究的朋友,将自己的研究成果展示在[学术研究文档](https://github.com/dataabc/weiboSpider/blob/master/docs/academic.md)里,这完全是自愿的。 -为方便大家引用,现提供该项目的 bibtex 条目如下: +为方便大家引用,现提供本项目的 bibtex 条目如下: ``` @misc{weibospider2020, - author = {CHEN Lei, SONG Zhengyang, schaepher, minami9, bluerthanever, MKSP2015, moqimoqidea, windlively, eggachecat, mtuwei, codermino, duangan1}, + author = {Lei Chen, Zhengyang Song, schaepher, minami9, bluerthanever, MKSP2015, moqimoqidea, windlively, eggachecat, mtuwei, codermino, duangan1}, title = {{Weibo Spider}}, howpublished = {\url{https://github.com/dataabc/weiboSpider}}, year = {2020} From 1f560db00cb402250c111dcae91bc7f8eddd98e9 Mon Sep 17 00:00:00 2001 From: caixiangyue Date: Fri, 13 May 2022 16:25:51 +0800 Subject: [PATCH 342/363] fix: info maybe cause list index out of range --- weibo_spider/parser/page_parser.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/weibo_spider/parser/page_parser.py b/weibo_spider/parser/page_parser.py index d57d8565..9c3aec37 100644 --- a/weibo_spider/parser/page_parser.py +++ b/weibo_spider/parser/page_parser.py @@ -43,6 +43,8 @@ def __init__(self, cookie, user_config, page, filter): for i in range(3): self.selector = handle_html(self.cookie, self.url) info = self.selector.xpath("//div[@class='c']") + if info is None or len(info) == 0: + continue is_exist = 
info[0].xpath("div/span[@class='ctt']") if is_exist: PageParser.empty_count = 0 From cda3ae47aee498828327b47b1593cfb0c5253a3b Mon Sep 17 00:00:00 2001 From: KimiDing Date: Thu, 19 May 2022 17:54:04 +0800 Subject: [PATCH 343/363] add mongo_config --- weibo_spider/config_sample.json | 7 ++++++- weibo_spider/spider.py | 3 ++- weibo_spider/writer/mongo_writer.py | 12 +++++++++--- 3 files changed, 17 insertions(+), 5 deletions(-) diff --git a/weibo_spider/config_sample.json b/weibo_spider/config_sample.json index b2652dfa..6d53cb99 100644 --- a/weibo_spider/config_sample.json +++ b/weibo_spider/config_sample.json @@ -24,5 +24,10 @@ "weibo_topics": ["spider_weibo"], "user_topics": ["spider_weibo"] }, - "sqlite_config": "weibo.db" + "sqlite_config": "weibo.db", + "mongo_config": { + "connection_string": "mongodb://test:testpwd@localhost:27017/weibo", + "dba_name": "admin", + "dba_password": "password" + } } diff --git a/weibo_spider/spider.py b/weibo_spider/spider.py index 3ac60b35..dabd6f77 100644 --- a/weibo_spider/spider.py +++ b/weibo_spider/spider.py @@ -73,6 +73,7 @@ def __init__(self, config): self.sqlite_config = config.get('sqlite_config') self.kafka_config = config.get('kafka_config') + self.mongo_config = config.get('mongo_config') self.user_config_file_path = '' user_id_list = config['user_id_list'] if FLAGS.user_id_list: @@ -272,7 +273,7 @@ def initialize_info(self, user_config): if 'mongo' in self.write_mode: from .writer import MongoWriter - self.writers.append(MongoWriter()) + self.writers.append(MongoWriter(self.mongo_config)) if 'sqlite' in self.write_mode: from .writer import SqliteWriter diff --git a/weibo_spider/writer/mongo_writer.py b/weibo_spider/writer/mongo_writer.py index 6a76e687..8fc586fb 100644 --- a/weibo_spider/writer/mongo_writer.py +++ b/weibo_spider/writer/mongo_writer.py @@ -8,8 +8,11 @@ class MongoWriter(Writer): - def __init__(self): - pass + def __init__(self, mongo_config): + self.mongo_config = mongo_config + self.connection_string 
= mongo_config['connection_string'] + self.dba_name = mongo_config['dba_name'] + self.dba_password = mongo_config['dba_password'] def _info_to_mongodb(self, collection, info_list): """将爬取的信息写入MongoDB数据库""" @@ -22,7 +25,10 @@ def _info_to_mongodb(self, collection, info_list): try: from pymongo import MongoClient - client = MongoClient() + client = MongoClient(self.connection_string) + if not self.dba_name.isspace() or not self.dba_password.isspace(): + client.admin.authenticate(self.dba_name,self.dba_password,mechanism='SCRAM-SHA-1') + db = client['weibo'] collection = db[collection] new_info_list = copy.deepcopy(info_list) From b2243381fcc6920e09d540853560ce2ed16ec151 Mon Sep 17 00:00:00 2001 From: qianxin Date: Mon, 20 Jun 2022 11:23:24 +0800 Subject: [PATCH 344/363] =?UTF-8?q?=E4=BF=AE=E5=A4=8D:FAQ=E4=B8=AD?= =?UTF-8?q?=E8=8E=B7=E5=8F=96=E8=87=AA=E5=B7=B1=E5=BE=AE=E5=8D=9A=E6=8F=8F?= =?UTF-8?q?=E8=BF=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/FAQ.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/FAQ.md b/docs/FAQ.md index c25a1938..14e41657 100644 --- a/docs/FAQ.md +++ b/docs/FAQ.md @@ -41,5 +41,5 @@ python3 -m weibo_spider 修改page_parser.py中__init__方法,将self.url修改为: ```python - self.url = "https://weibo.cn/%s/profile?page=%d" % (user_uri, page) + self.url = "https://weibo.cn/%s/profile?page=%d" % (self.user_uri, page) ``` From 62a00294e3719fdcc0f5c6fbc72c96e64fc325ce Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 6 Jul 2022 20:12:46 +0000 Subject: [PATCH 345/363] build(deps): bump lxml from 4.6.5 to 4.9.1 Bumps [lxml](https://github.com/lxml/lxml) from 4.6.5 to 4.9.1. 
- [Release notes](https://github.com/lxml/lxml/releases) - [Changelog](https://github.com/lxml/lxml/blob/master/CHANGES.txt) - [Commits](https://github.com/lxml/lxml/compare/lxml-4.6.5...lxml-4.9.1) --- updated-dependencies: - dependency-name: lxml dependency-type: direct:production ... Signed-off-by: dependabot[bot] --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index e1e0117a..5f3568fe 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -lxml==4.6.5 +lxml==4.9.1 requests==2.23.0 tqdm==4.46.1 absl-py==0.9.0 \ No newline at end of file From 95ab514caf59330e925af3df552538d285799e70 Mon Sep 17 00:00:00 2001 From: jerrylaikr Date: Fri, 29 Jul 2022 17:20:33 +0000 Subject: [PATCH 346/363] =?UTF-8?q?=E5=8A=A0=E5=85=A5MongoDB=E8=AE=BE?= =?UTF-8?q?=E7=BD=AE=E7=9A=84=E8=AF=B4=E6=98=8E=E5=92=8C=E4=BE=8B=E5=AD=90?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/settings.md | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/docs/settings.md b/docs/settings.md index 9b89ae93..41fa065f 100644 --- a/docs/settings.md +++ b/docs/settings.md @@ -178,6 +178,28 @@ $ pip install pymysql ```bash $ pip install pymongo ``` +connection_string是MongoDB标准URI: +```text +mongodb://[username:password@]host1[:port1][,...hostN[:portN]][/[defaultauthdb][?options]] +``` + +dba_name和dba_password对应URI中的username和password。如果没有访问限制可不填。 +无访问限制的例子: +```json +"connection_string": "mongodb://localhost:27017/weibo", +``` +使用用户名和密码的例子: +```json +"connection_string": "mongodb://admin:password@localhost:27017/weibo", +"dba_name": "", +"dba_password": "", +``` +或 +```json +"connection_string": "mongodb://localhost:27017/weibo", +"dba_name": "admin", +"dba_password": "password", +``` 
MySQL和MongDB数据库的写入内容一样。程序首先会创建一个名为"weibo"的数据库,然后再创建"user"表和"weibo"表,包含爬取的所有内容。爬取到的微博**用户信息**或插入或更新,都会存储到user表里;爬取到的**微博信息**或插入或更新,都会存储到weibo表里,两个表通过user_id关联。如果想了解两个表的具体字段,请点击"详情"。 From 999b66d6095db449552e95711edb008ebb4df639 Mon Sep 17 00:00:00 2001 From: jerrylaikr Date: Fri, 29 Jul 2022 17:21:38 +0000 Subject: [PATCH 347/363] =?UTF-8?q?=E6=9B=B4=E6=96=B0MongoDB=E8=AE=BE?= =?UTF-8?q?=E7=BD=AE=E5=AF=B9=E7=A9=BA=E5=AD=97=E7=AC=A6=E4=B8=B2=E7=9A=84?= =?UTF-8?q?=E8=AF=86=E5=88=AB?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- weibo_spider/writer/mongo_writer.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/weibo_spider/writer/mongo_writer.py b/weibo_spider/writer/mongo_writer.py index 8fc586fb..c6c08c5b 100644 --- a/weibo_spider/writer/mongo_writer.py +++ b/weibo_spider/writer/mongo_writer.py @@ -11,8 +11,8 @@ class MongoWriter(Writer): def __init__(self, mongo_config): self.mongo_config = mongo_config self.connection_string = mongo_config['connection_string'] - self.dba_name = mongo_config['dba_name'] - self.dba_password = mongo_config['dba_password'] + self.dba_name = mongo_config.get('dba_name', None) + self.dba_password = mongo_config.get('dba_password', None) def _info_to_mongodb(self, collection, info_list): """将爬取的信息写入MongoDB数据库""" @@ -26,8 +26,11 @@ def _info_to_mongodb(self, collection, info_list): from pymongo import MongoClient client = MongoClient(self.connection_string) - if not self.dba_name.isspace() or not self.dba_password.isspace(): - client.admin.authenticate(self.dba_name,self.dba_password,mechanism='SCRAM-SHA-1') + if self.dba_name or self.dba_password: + # authenticate() 在PyMongo3.6版本就已弃用,这一段可能需要后续跟进 + client.admin.authenticate( + self.dba_name, self.dba_password, mechanism='SCRAM-SHA-1' + ) db = client['weibo'] collection = db[collection] From 1e879cf6df712b3486d1f5ea0cc79d4a96d39662 Mon Sep 17 00:00:00 2001 From: jerrylaikr Date: Fri, 29 Jul 2022 17:29:02 +0000 
Subject: [PATCH 348/363] update mongo_config --- weibo_spider/config_sample.json | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/weibo_spider/config_sample.json b/weibo_spider/config_sample.json index 6d53cb99..330e2a10 100644 --- a/weibo_spider/config_sample.json +++ b/weibo_spider/config_sample.json @@ -26,8 +26,8 @@ }, "sqlite_config": "weibo.db", "mongo_config": { - "connection_string": "mongodb://test:testpwd@localhost:27017/weibo", - "dba_name": "admin", - "dba_password": "password" + "connection_string": "mongodb://admin:password@localhost:27017/weibo", + "dba_name": "", + "dba_password": "" } } From a51e4a84e8da2c49f6643a31c48ee594fe36e376 Mon Sep 17 00:00:00 2001 From: unknown <518984@qq.com> Date: Sun, 16 Oct 2022 22:17:26 +0800 Subject: [PATCH 349/363] correct user's profile url when fetching weibo Issue#482 --- weibo_spider/parser/index_parser.py | 2 +- weibo_spider/parser/page_parser.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/weibo_spider/parser/index_parser.py b/weibo_spider/parser/index_parser.py index caab1286..0c5f5e4a 100644 --- a/weibo_spider/parser/index_parser.py +++ b/weibo_spider/parser/index_parser.py @@ -11,7 +11,7 @@ class IndexParser(Parser): def __init__(self, cookie, user_uri): self.cookie = cookie self.user_uri = user_uri - self.url = 'https://weibo.cn/%s' % (user_uri) + self.url = 'https://weibo.cn/%s/profile' % (user_uri) self.selector = handle_html(self.cookie, self.url) def _get_user_id(self): diff --git a/weibo_spider/parser/page_parser.py b/weibo_spider/parser/page_parser.py index 9c3aec37..cbee1525 100644 --- a/weibo_spider/parser/page_parser.py +++ b/weibo_spider/parser/page_parser.py @@ -25,7 +25,7 @@ def __init__(self, cookie, user_config, page, filter): self.since_date = user_config['since_date'] self.end_date = user_config['end_date'] self.page = page - self.url = 'https://weibo.cn/%s?page=%d' % (self.user_uri, page) + self.url = 'https://weibo.cn/%s/profile?page=%d' % 
(self.user_uri, page) if self.end_date != 'now': since_date = self.since_date.split(' ')[0].split('-') end_date = self.end_date.split(' ')[0].split('-') From 2a1400b39884e310d762320683cec3be012ea0b6 Mon Sep 17 00:00:00 2001 From: linbuxiao Date: Wed, 16 Nov 2022 18:45:34 +0800 Subject: [PATCH 350/363] perf: update absl-py lock version --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 5f3568fe..033e1705 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ lxml==4.9.1 requests==2.23.0 tqdm==4.46.1 -absl-py==0.9.0 \ No newline at end of file +absl-py==0.12.0 \ No newline at end of file From 6d4f5f9e8bc7b5b9ddee00f0728b8c03a36fff65 Mon Sep 17 00:00:00 2001 From: Zhengyang Song Date: Fri, 19 May 2023 22:16:36 +0800 Subject: [PATCH 351/363] Fix #484: run correctly when there are 2 pinned weibo. --- weibo_spider/parser/page_parser.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/weibo_spider/parser/page_parser.py b/weibo_spider/parser/page_parser.py index cbee1525..a2c5900b 100644 --- a/weibo_spider/parser/page_parser.py +++ b/weibo_spider/parser/page_parser.py @@ -10,6 +10,8 @@ from .parser import Parser from .util import handle_garbled, handle_html, to_video_download_url +MAX_PINNED_COUNT = 2 + logger = logging.getLogger('spider.page_parser') @@ -58,6 +60,7 @@ def __init__(self, cookie, user_config, page, filter): def get_one_page(self, weibo_id_list): """获取第page页的全部微博""" + cur_pinned_count = 0 try: info = self.selector.xpath("//div[@class='c']") is_exist = info[0].xpath("div/span[@class='ctt']") @@ -72,8 +75,11 @@ def get_one_page(self, weibo_id_list): publish_time = datetime_util.str_to_time( weibo.publish_time) - if publish_time < since_date: - if self.is_pinned_weibo(info[i]): + if publish_time < since_date: + # As of 2023.05, there can be at most 2 pinned weibo. + # We will continue for at most 2 times before return. 
+ if self.page == 1 and cur_pinned_count < MAX_PINNED_COUNT: + cur_pinned_count += 1 continue else: return weibos, weibo_id_list, False @@ -301,14 +307,6 @@ def get_video_url(self, info): return video_url - def is_pinned_weibo(self, info): - """判断微博是否为置顶微博""" - kt = info.xpath(".//span[@class='kt']/text()") - if kt and kt[0] == u'置顶': - return True - else: - return False - def get_one_weibo(self, info): """获取一条微博的全部信息""" try: From 55ddb32929f0d47af9b544f639b4849fbf09f48e Mon Sep 17 00:00:00 2001 From: Zhengyang Song Date: Fri, 19 May 2023 22:51:46 +0800 Subject: [PATCH 352/363] Fix: fix pytest by fixing the url map of testdata. The new url format are introduced by https://github.com/dataabc/weiboSpider/commit/a51e4a84e8da2c49f6643a31c48ee594fe36e376 --- tests/testdata/url_map.json | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/testdata/url_map.json b/tests/testdata/url_map.json index 22be7207..39cbd80e 100644 --- a/tests/testdata/url_map.json +++ b/tests/testdata/url_map.json @@ -1,11 +1,11 @@ { - "https://weibo.cn/1669879400": "tests/testdata/a4437630f3bdfa2757bae1595186ac063fe5ec25cf2f98116ece83cb.html", + "https://weibo.cn/1669879400/profile": "tests/testdata/a4437630f3bdfa2757bae1595186ac063fe5ec25cf2f98116ece83cb.html", "https://weibo.cn/1669879400/info": "tests/testdata/ca5f2a555e8d62f728c66fa90afb2d54d19f8c898e164204a61bdf03.html", - "https://weibo.cn/1669879400?page=1": "tests/testdata/4957814af5a123b82e974b5537dea736dfb34e48d8835203a45d2e67.html", + "https://weibo.cn/1669879400/profile?page=1": "tests/testdata/4957814af5a123b82e974b5537dea736dfb34e48d8835203a45d2e67.html", "https://weibo.cn/mblog/picAll/J6k49kbTc?rl=1": "tests/testdata/e97222acd5bc7d8d1bfbd3f352f8cad3e36fdd19e40b69e1c33fb3c3.html", "https://weibo.cn/mblog/picAll/J5ZcSnCAg?rl=1": "tests/testdata/63a98849ec82b2c87ec55bca03cbf5988f7eac233a23d86b4fdf5ffd.html", - "https://weibo.cn/1669879400?page=2": 
"tests/testdata/2f62165fa3ca1e85e0d398d385c377a068b76eb95765f7020ffffd3e.html", - "https://weibo.cn/1669879400?page=3": "tests/testdata/d486235d4a17dd0accb0f2cc77b3648abfa03580b9e0cdb61f1e618f.html", + "https://weibo.cn/1669879400/profile?page=2": "tests/testdata/2f62165fa3ca1e85e0d398d385c377a068b76eb95765f7020ffffd3e.html", + "https://weibo.cn/1669879400/profile?page=3": "tests/testdata/d486235d4a17dd0accb0f2cc77b3648abfa03580b9e0cdb61f1e618f.html", "https://weibo.cn/mblog/picAll/J3xfm61AZ?rl=1": "tests/testdata/76233b3f90394581aac6f19cfa5d674a610e8b442b1f83de7673ab49.html", "https://weibo.cn/comment/J5cVGuUNq": "tests/testdata/4d5ed0a3ebd0303cb45edd544dbc0ab5e86d43e103405f0c60515884.html", "https://weibo.cn/1980768563/photo?tf=6_008": "tests/testdata/e4d541ecb02253c14abc1d52605fc00d91279df9ac4c1465c85b91b3.html", From 3fc644004464079d57966e41f72951c271ad95d2 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 22 May 2023 23:19:47 +0000 Subject: [PATCH 353/363] build(deps): bump requests from 2.23.0 to 2.31.0 Bumps [requests](https://github.com/psf/requests) from 2.23.0 to 2.31.0. - [Release notes](https://github.com/psf/requests/releases) - [Changelog](https://github.com/psf/requests/blob/main/HISTORY.md) - [Commits](https://github.com/psf/requests/compare/v2.23.0...v2.31.0) --- updated-dependencies: - dependency-name: requests dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 033e1705..68aaf6d1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ lxml==4.9.1 -requests==2.23.0 +requests==2.31.0 tqdm==4.46.1 absl-py==0.12.0 \ No newline at end of file From 6ec688277b7b1784e3dc912417c183e68a62bdc5 Mon Sep 17 00:00:00 2001 From: Zhengyang Song Date: Fri, 28 Jul 2023 00:13:33 +0800 Subject: [PATCH 354/363] Update the stale action rule to not mark issues with assignees as stale. Reference: https://github.com/probot/stale --- .github/stale.yml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/.github/stale.yml b/.github/stale.yml index cbbef7f8..9e5f5ca7 100644 --- a/.github/stale.yml +++ b/.github/stale.yml @@ -1,20 +1,30 @@ # Number of days of inactivity before an issue becomes stale daysUntilStale: 60 + # Number of days of inactivity before a stale issue is closed daysUntilClose: 7 + # Issues with these labels will never be considered stale exemptLabels: - pinned - security - to do + +# Set to true to ignore issues with an assignee +exemptAssignees: true + # Label to use when marking an issue as stale staleLabel: wontfix + # Comment to post when marking an issue as stale. Set to `false` to disable markComment: > This issue has been automatically marked as stale because it has not had recent activity. It will be closed if no further activity occurs. Thank you for your contributions. + # Comment to post when closing a stale issue. Set to `false` to disable closeComment: > Closing as stale, please reopen if you'd like to work on this further. + +# Limit to only `issues` or `pulls` only: issues From 829a891c94aa525311d9979432a4b3c91e8fbad3 Mon Sep 17 00:00:00 2001 From: Zhengyang Song Date: Sun, 27 Aug 2023 15:11:12 +0800 Subject: [PATCH 355/363] Fix the crawling of toutiao article urls. 
--- weibo_spider/parser/page_parser.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/weibo_spider/parser/page_parser.py b/weibo_spider/parser/page_parser.py index a2c5900b..fcd17142 100644 --- a/weibo_spider/parser/page_parser.py +++ b/weibo_spider/parser/page_parser.py @@ -75,7 +75,7 @@ def get_one_page(self, weibo_id_list): publish_time = datetime_util.str_to_time( weibo.publish_time) - if publish_time < since_date: + if publish_time < since_date: # As of 2023.05, there can be at most 2 pinned weibo. # We will continue for at most 2 times before return. if self.page == 1 and cur_pinned_count < MAX_PINNED_COUNT: @@ -158,9 +158,9 @@ def get_article_url(self, info): """获取微博头条文章的url""" article_url = '' text = handle_garbled(info) - if text.startswith(u'发布了头条文章'): + if text.startswith(u'发布了头条文章') or text.startswith(u'我发表了头条文章'): url = info.xpath('.//a/@href') - if url and url[0].startswith('https://weibo.cn/sinaurl'): + if url and url[0].startswith('https://weibo.com/ttarticle'): article_url = url[0] return article_url From ac550e0a50e8da81c506f8696133f5037af4beb4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=BA=8C=E4=B9=94?= <605056080@qq.com.com> Date: Sat, 27 Apr 2024 18:03:18 +0800 Subject: [PATCH 356/363] =?UTF-8?q?issues=5Fbug=5F574=20=E6=97=A0=E6=B3=95?= =?UTF-8?q?=E5=8C=B9=E9=85=8D=E8=8E=B7=E5=8F=96=E5=BE=AE=E5=8D=9A=E9=95=BF?= =?UTF-8?q?=E6=96=87=EF=BC=8C=E5=B0=9D=E8=AF=95=E4=BF=AE=E5=A4=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- weibo_spider/parser/comment_parser.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/weibo_spider/parser/comment_parser.py b/weibo_spider/parser/comment_parser.py index 28e75e30..6e06c776 100644 --- a/weibo_spider/parser/comment_parser.py +++ b/weibo_spider/parser/comment_parser.py @@ -1,8 +1,11 @@ import logging import random import requests +import re from time import sleep - +from lxml.html import tostring 
+from lxml.html import fromstring +from lxml import etree from .parser import Parser from .util import handle_garbled, handle_html @@ -21,11 +24,17 @@ def get_long_weibo(self): for i in range(5): self.selector = handle_html(self.cookie, self.url) if self.selector is not None: - info = self.selector.xpath("//div[@class='c']")[1] - wb_content = handle_garbled(info) - wb_time = info.xpath("//span[@class='ct']/text()")[0] - weibo_content = wb_content[wb_content.find(':') + - 1:wb_content.rfind(wb_time)] + info_div = self.selector.xpath("//div[@class='c' and @id='M_']")[0] + info_span = info_div.xpath("//span[@class='ctt']")[0] + # 1. 获取 info_span 中的所有 HTML 代码作为字符串 + html_string = etree.tostring(info_span, encoding='unicode', method='html') + # 2. 将
替换为 \n + html_string = html_string.replace('
', '\n') + # 3. 去掉所有 HTML 标签,但保留标签内的有效文本 + new_content = fromstring(html_string).text_content() + # 4. 替换多个连续的 \n 为一个 \n + new_content = re.sub(r'\n+', '\n', new_content) + weibo_content = handle_garbled(new_content) if weibo_content is not None: return weibo_content sleep(random.randint(6, 10)) From 8c4eb7f8c2838b7ba67954e47913694e8a5f6507 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=BA=8C=E4=B9=94?= <605056080@qq.com> Date: Sat, 27 Apr 2024 18:43:06 +0800 Subject: [PATCH 357/363] =?UTF-8?q?issues=5Fbug=5F574=20=E6=97=A0=E6=B3=95?= =?UTF-8?q?=E5=8C=B9=E9=85=8D=E8=8E=B7=E5=8F=96=E5=BE=AE=E5=8D=9A=E9=95=BF?= =?UTF-8?q?=E6=96=87=EF=BC=8C=E5=B0=9D=E8=AF=95=E4=BF=AE=E5=A4=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- weibo_spider/parser/util.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/weibo_spider/parser/util.py b/weibo_spider/parser/util.py index 3169f24e..81aa4297 100644 --- a/weibo_spider/parser/util.py +++ b/weibo_spider/parser/util.py @@ -48,8 +48,13 @@ def handle_html(cookie, url): def handle_garbled(info): """处理乱码""" try: - info = (info.xpath('string(.)').replace(u'\u200b', '').encode( - sys.stdout.encoding, 'ignore').decode(sys.stdout.encoding)) + if hasattr(info, 'xpath'): # 检查 info 是否具有 xpath 方法 + info_str = info.xpath('string(.)') # 提取字符串内容 + else: + info_str = str(info) # 若不支持 xpath,将其转换为字符串 + + info = info_str.replace(u'\u200b', '').encode( + sys.stdout.encoding, 'ignore').decode(sys.stdout.encoding) return info except Exception as e: logger.exception(e) From 2c7f72301844305357ce3c15783ea302780ba39c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=BA=8C=E4=B9=94?= <605056080@qq.com> Date: Sun, 28 Apr 2024 11:12:12 +0800 Subject: [PATCH 358/363] =?UTF-8?q?issues=5Ffeature=5Fpost=5Fapi=5F576=20?= =?UTF-8?q?=E5=AE=9E=E7=8E=B0=E9=80=9A=E8=BF=87POST=E6=96=B9=E5=BC=8F?= =?UTF-8?q?=E5=B0=86=E6=95=B0=E6=8D=AE=E6=8E=A8=E9=80=81=E5=88=B0=E8=87=AA?= 
=?UTF-8?q?=E5=AE=9A=E4=B9=89=E6=8E=A5=E5=8F=A3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/settings.md | 12 +++++++ weibo_spider/config_sample.json | 4 +++ weibo_spider/config_util.py | 4 +-- weibo_spider/spider.py | 6 ++++ weibo_spider/writer/__init__.py | 3 +- weibo_spider/writer/post_writer.py | 57 ++++++++++++++++++++++++++++++ 6 files changed, 83 insertions(+), 3 deletions(-) create mode 100644 weibo_spider/writer/post_writer.py diff --git a/docs/settings.md b/docs/settings.md index 41fa065f..03733670 100644 --- a/docs/settings.md +++ b/docs/settings.md @@ -239,3 +239,15 @@ MySQL和MongDB数据库的写入内容一样。程序首先会创建一个名为 - **publish_tool**:存储微博的发布工具。 + +## 设置API接口POST联动(可选) + +本部分是可选部分,如果不需要将爬取信息通过POST请求发送到指定API接口,可跳过这一步 + +请求数据格式为 `content-type : application/json`,接口响应返回也需要是 `content-type : application/json`,HTTP状态码为 `200` + +数据主体与 `write_mode` 配置的 `json` 输出格式一致,是整页获取数据json,每页POST发送一次 + +`api_url` 为指定的API接口地址 + +`api_token` 为接口鉴权TOKEN,将在 Request Headers 中添加 `api-token` 字段,根据需要配置 \ No newline at end of file diff --git a/weibo_spider/config_sample.json b/weibo_spider/config_sample.json index 330e2a10..262398d9 100644 --- a/weibo_spider/config_sample.json +++ b/weibo_spider/config_sample.json @@ -29,5 +29,9 @@ "connection_string": "mongodb://admin:password@localhost:27017/weibo", "dba_name": "", "dba_password": "" + }, + "post_config": { + "api_url": "", + "api_token": "" } } diff --git a/weibo_spider/config_util.py b/weibo_spider/config_util.py index 55e4bdd8..ba4676b3 100644 --- a/weibo_spider/config_util.py +++ b/weibo_spider/config_util.py @@ -85,14 +85,14 @@ def validate_config(config): sys.exit() # 验证write_mode - write_mode = ['txt', 'csv', 'json', 'mongo', 'mysql', 'sqlite', 'kafka'] + write_mode = ['txt', 'csv', 'json', 'mongo', 'mysql', 'sqlite', 'kafka','post'] if not isinstance(config['write_mode'], list): logger.warning(u'write_mode值应为list类型') sys.exit() for mode in config['write_mode']: if mode not in 
write_mode: logger.warning( - u'%s为无效模式,请从txt、csv、json、mongo、sqlite, kafka和mysql中挑选一个或多个作为write_mode', + u'%s为无效模式,请从txt、csv、json、post、mongo、sqlite, kafka和mysql中挑选一个或多个作为write_mode', mode) sys.exit() diff --git a/weibo_spider/spider.py b/weibo_spider/spider.py index dabd6f77..e0d2e41e 100644 --- a/weibo_spider/spider.py +++ b/weibo_spider/spider.py @@ -74,6 +74,7 @@ def __init__(self, config): self.sqlite_config = config.get('sqlite_config') self.kafka_config = config.get('kafka_config') self.mongo_config = config.get('mongo_config') + self.post_config = config.get('post_config') self.user_config_file_path = '' user_id_list = config['user_id_list'] if FLAGS.user_id_list: @@ -284,6 +285,11 @@ def initialize_info(self, user_config): self.writers.append(KafkaWriter(self.kafka_config)) + if 'post' in self.write_mode: + from .writer import PostWriter + + self.writers.append(PostWriter(self.post_config)) + self.downloaders = [] if self.pic_download == 1: from .downloader import (OriginPictureDownloader, diff --git a/weibo_spider/writer/__init__.py b/weibo_spider/writer/__init__.py index 5868f1ac..f6b24bd6 100644 --- a/weibo_spider/writer/__init__.py +++ b/weibo_spider/writer/__init__.py @@ -5,5 +5,6 @@ from .txt_writer import TxtWriter from .sqlite_writer import SqliteWriter from .kafka_writer import KafkaWriter +from .post_writer import PostWriter -__all__ = [CsvWriter, TxtWriter, JsonWriter, MongoWriter, MySqlWriter, SqliteWriter, KafkaWriter] +__all__ = [CsvWriter, TxtWriter, JsonWriter, MongoWriter, MySqlWriter, SqliteWriter, KafkaWriter, PostWriter] diff --git a/weibo_spider/writer/post_writer.py b/weibo_spider/writer/post_writer.py new file mode 100644 index 00000000..7446fbea --- /dev/null +++ b/weibo_spider/writer/post_writer.py @@ -0,0 +1,57 @@ +import codecs +import json +import logging +import os +import requests + +from .writer import Writer + +logger = logging.getLogger('spider.post_writer') + +class PostWriter(Writer): + def __init__(self, post_config): + 
self.post_config = post_config + self.api_url = post_config['api_url'] + self.api_token = post_config.get('api_token', None) + self.dba_password = post_config.get('dba_password', None) + + def write_user(self, user): + self.user = user + + def _update_json_data(self, data, weibo_info): + """将获取到的微博数据转换为json输出模式一致""" + data['user'] = self.user.__dict__ + if data.get('weibo'): + data['weibo'] += weibo_info + else: + data['weibo'] = weibo_info + return data + + def send_post_request_with_token(self, url, data, token, max_retries, backoff_factor): + headers = { + 'Content-Type': 'application/json', + 'api-token': f'{token}', + } + for attempt in range(max_retries + 1): + try: + response = requests.post(url, json=data, headers=headers) + if response.status_code == requests.codes.ok: + return response.json() + else: + raise RequestException(f"Unexpected response status: {response.status_code}") + except RequestException as e: + if attempt < max_retries: + sleep(backoff_factor * (attempt + 1)) # 逐步增加等待时间,避免频繁重试 + continue + else: + logger.error(f"在尝试{max_retries}次发出POST连接后,请求失败:{e}") + + def write_weibo(self, weibos): + """将爬到的信息POST到API""" + data = {} + data = self._update_json_data(data, [w.__dict__ for w in weibos]) + if data: + self.send_post_request_with_token(self.api_url, data, self.api_token, 3, 2) + logger.info(u'%d条微博通过POST发送到 %s', len(weibos), self.api_url) + else: + logger.info(u'没有获取到微博,略过API POST') From c9cf2181269926f89325c77035e5bdf9f3625100 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=BA=8C=E4=B9=94?= <605056080@qq.com> Date: Sun, 28 Apr 2024 17:04:19 +0800 Subject: [PATCH 359/363] =?UTF-8?q?issues=5Ffeature=5Fpost=5Fapi=5F576=20?= =?UTF-8?q?=E5=AE=9E=E7=8E=B0=E9=80=9A=E8=BF=87POST=E6=96=B9=E5=BC=8F?= =?UTF-8?q?=E5=B0=86=E6=95=B0=E6=8D=AE=E6=8E=A8=E9=80=81=E5=88=B0=E8=87=AA?= =?UTF-8?q?=E5=AE=9A=E4=B9=89=E6=8E=A5=E5=8F=A3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- weibo_spider/parser/comment_parser.py | 
2 +- weibo_spider/writer/post_writer.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/weibo_spider/parser/comment_parser.py b/weibo_spider/parser/comment_parser.py index 6e06c776..c0117d80 100644 --- a/weibo_spider/parser/comment_parser.py +++ b/weibo_spider/parser/comment_parser.py @@ -33,7 +33,7 @@ def get_long_weibo(self): # 3. 去掉所有 HTML 标签,但保留标签内的有效文本 new_content = fromstring(html_string).text_content() # 4. 替换多个连续的 \n 为一个 \n - new_content = re.sub(r'\n+', '\n', new_content) + new_content = re.sub(r'\n+\s*', '\n', new_content) weibo_content = handle_garbled(new_content) if weibo_content is not None: return weibo_content diff --git a/weibo_spider/writer/post_writer.py b/weibo_spider/writer/post_writer.py index 7446fbea..af536623 100644 --- a/weibo_spider/writer/post_writer.py +++ b/weibo_spider/writer/post_writer.py @@ -5,6 +5,8 @@ import requests from .writer import Writer +from time import sleep +from requests.exceptions import RequestException logger = logging.getLogger('spider.post_writer') From 7159638c487afd86cbbe39994e27e09370506c26 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 3 May 2024 19:58:52 +0000 Subject: [PATCH 360/363] build(deps): bump tqdm from 4.46.1 to 4.66.3 Bumps [tqdm](https://github.com/tqdm/tqdm) from 4.46.1 to 4.66.3. - [Release notes](https://github.com/tqdm/tqdm/releases) - [Commits](https://github.com/tqdm/tqdm/compare/v4.46.1...v4.66.3) --- updated-dependencies: - dependency-name: tqdm dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 68aaf6d1..097c14b7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ lxml==4.9.1 requests==2.31.0 -tqdm==4.46.1 +tqdm==4.66.3 absl-py==0.12.0 \ No newline at end of file From ae9224ec081af5ba94f79b4743b56d97165dfa3f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=BA=B5=E7=BA=AC?= <19372775+zangwill@users.noreply.github.com> Date: Fri, 17 May 2024 11:13:05 +0800 Subject: [PATCH 361/363] =?UTF-8?q?=E8=A7=A3=E5=86=B3=20'NoneType'=20objec?= =?UTF-8?q?t=20has=20no=20attribute=20'xpath'?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit handle_html 函数返回 Element 或者 None,`page_parser.py` 没有处理 None 的情况 ```py 'NoneType' object has no attribute 'xpath' Traceback (most recent call last): File "C:\Users\zw\miniconda3\lib\site-packages\weibo_spider\spider.py", line 178, in get_weibo_info weibos, self.weibo_id_list, to_continue = PageParser( File "C:\Users\zw\miniconda3\lib\site-packages\weibo_spider\parser\page_parser.py", line 47, in __init__ info = self.selector.xpath("//div[@class='c']") AttributeError: 'NoneType' object has no attribute 'xpath' ``` --- weibo_spider/parser/page_parser.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/weibo_spider/parser/page_parser.py b/weibo_spider/parser/page_parser.py index fcd17142..6e356440 100644 --- a/weibo_spider/parser/page_parser.py +++ b/weibo_spider/parser/page_parser.py @@ -44,10 +44,11 @@ def __init__(self, cookie, user_config, page, filter): is_exist = '' for i in range(3): self.selector = handle_html(self.cookie, self.url) - info = self.selector.xpath("//div[@class='c']") - if info is None or len(info) == 0: - continue - is_exist = info[0].xpath("div/span[@class='ctt']") + if self.selector: + info = self.selector.xpath("//div[@class='c']") + if info is None or len(info) == 0: + continue 
+ is_exist = info[0].xpath("div/span[@class='ctt']") if is_exist: PageParser.empty_count = 0 break From 496c09cbc5332ac828937d8487a700c6dc01cc7a Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 21 May 2024 05:33:04 +0000 Subject: [PATCH 362/363] --- updated-dependencies: - dependency-name: requests dependency-type: direct:production ... Signed-off-by: dependabot[bot] --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 097c14b7..23b93002 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ lxml==4.9.1 -requests==2.31.0 +requests==2.32.0 tqdm==4.66.3 absl-py==0.12.0 \ No newline at end of file From 8c49c81168caa110f3e774ab2713c6a847f0be72 Mon Sep 17 00:00:00 2001 From: kyle-qi Date: Tue, 24 Mar 2026 15:22:02 +0800 Subject: [PATCH 363/363] =?UTF-8?q?fix:=20=E6=94=B9=E8=BF=9B=E8=A2=AB?= =?UTF-8?q?=E5=B0=81=E7=A6=81=E8=B4=A6=E5=8F=B7=E7=88=AC=E5=8F=96=E5=8F=8A?= =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E4=BB=A3=E7=90=86=E6=94=AF=E6=8C=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. 添加全局代理配置支持(config.json 中配置 proxy 字段) 2. handle_html 增加 403/432 等状态码的差异化处理和重试策略 3. info_parser 兼容自己查看自己资料页和查看他人资料页两种HTML结构 4. 学习经历/工作经历提取改用 following-sibling 定位,更健壮 5. page_parser 异常时返回空列表而非 None,避免上层解包报错 6. 
文件下载和视频URL获取均支持代理 --- weibo_spider/downloader/downloader.py | 4 +- weibo_spider/parser/info_parser.py | 71 +++++++++++++++++------- weibo_spider/parser/page_parser.py | 1 + weibo_spider/parser/util.py | 79 ++++++++++++++++++--------- weibo_spider/spider.py | 5 ++ 5 files changed, 113 insertions(+), 47 deletions(-) diff --git a/weibo_spider/downloader/downloader.py b/weibo_spider/downloader/downloader.py index 4a07d67b..ac75c608 100644 --- a/weibo_spider/downloader/downloader.py +++ b/weibo_spider/downloader/downloader.py @@ -36,9 +36,11 @@ def download_one_file(self, url, file_path, weibo_id): s = requests.Session() s.mount(url, HTTPAdapter(max_retries=self.file_download_timeout[0])) + from ..parser.util import get_proxies downloaded = s.get(url, timeout=(self.file_download_timeout[1], - self.file_download_timeout[2])) + self.file_download_timeout[2]), + proxies=get_proxies()) with open(file_path, 'wb') as f: f.write(downloaded.content) except Exception as e: diff --git a/weibo_spider/parser/info_parser.py b/weibo_spider/parser/info_parser.py index 90164554..ad39a2b2 100644 --- a/weibo_spider/parser/info_parser.py +++ b/weibo_spider/parser/info_parser.py @@ -25,31 +25,64 @@ def extract_user_info(self): sys.exit() user.nickname = nickname - basic_info = self.selector.xpath("//div[@class='c'][3]/text()") zh_list = [u'性别', u'地区', u'生日', u'简介', u'认证', u'达人'] en_list = [ 'gender', 'location', 'birthday', 'description', 'verified_reason', 'talent' ] + + # 先尝试标准格式(查看他人资料页) + basic_info = self.selector.xpath("//div[@class='c'][3]/text()") + has_info = any( + ':' in str(i) and str(i).split(':', 1)[0] in zh_list + for i in basic_info) + + if not has_info: + # 自己查看自己的资料页:标签在标签内,值在的tail文本中 + basic_info = [] + for c_div in self.selector.xpath("//div[@class='c']"): + a_texts = c_div.xpath('a/text()') + if u'性别' in a_texts or u'昵称' in a_texts: + for a in c_div.xpath('a'): + label = (a.text or '').strip() + tail = (a.tail or '').strip() + if label in zh_list and 
tail.startswith(':'): + basic_info.append(label + tail) + break + for i in basic_info: - if i.split(':', 1)[0] in zh_list: - setattr(user, en_list[zh_list.index(i.split(':', 1)[0])], - i.split(':', 1)[1].replace('\u3000', '')) - - experienced = self.selector.xpath("//div[@class='tip'][2]/text()") - if experienced and experienced[0] == u'学习经历': - user.education = self.selector.xpath( - "//div[@class='c'][4]/text()")[0][1:].replace( - u'\xa0', u' ') - if self.selector.xpath( - "//div[@class='tip'][3]/text()")[0] == u'工作经历': - user.work = self.selector.xpath( - "//div[@class='c'][5]/text()")[0][1:].replace( - u'\xa0', u' ') - elif experienced and experienced[0] == u'工作经历': - user.work = self.selector.xpath( - "//div[@class='c'][4]/text()")[0][1:].replace( - u'\xa0', u' ') + if ':' in str(i) and str(i).split(':', 1)[0] in zh_list: + setattr(user, en_list[zh_list.index(str(i).split(':', 1)[0])], + str(i).split(':', 1)[1].replace('\u3000', '')) + + # 提取学习经历和工作经历,使用following-sibling定位,兼容自己和他人页面 + tip_divs = self.selector.xpath("//div[@class='tip']") + for tip in tip_divs: + tip_text = tip.xpath('string(.)').strip() + if tip_text == u'学习经历': + edu_div = tip.xpath( + 'following-sibling::div[@class="c"][1]') + if edu_div: + # 优先用text()(他人页面),fallback用string(.)(自己页面) + edu_text = edu_div[0].xpath('text()') + if edu_text and len(edu_text[0].strip()) > 1: + user.education = edu_text[0][1:].replace( + u'\xa0', u' ') + else: + user.education = ' '.join( + edu_div[0].xpath('string(.)').split()) + elif tip_text == u'工作经历': + work_div = tip.xpath( + 'following-sibling::div[@class="c"][1]') + if work_div: + work_text = work_div[0].xpath('text()') + if work_text and len(work_text[0].strip()) > 1: + user.work = work_text[0][1:].replace( + u'\xa0', u' ') + else: + user.work = ' '.join( + work_div[0].xpath('string(.)').split()) + return user except Exception as e: logger.exception(e) diff --git a/weibo_spider/parser/page_parser.py b/weibo_spider/parser/page_parser.py index 
6e356440..6acf1ad9 100644 --- a/weibo_spider/parser/page_parser.py +++ b/weibo_spider/parser/page_parser.py @@ -91,6 +91,7 @@ def get_one_page(self, weibo_id_list): return weibos, weibo_id_list, self.to_continue except Exception as e: logger.exception(e) + return [], weibo_id_list, self.to_continue def is_original(self, info): """判断微博是否为原创微博""" diff --git a/weibo_spider/parser/util.py b/weibo_spider/parser/util.py index 81aa4297..b7c95736 100644 --- a/weibo_spider/parser/util.py +++ b/weibo_spider/parser/util.py @@ -12,37 +12,62 @@ URL_MAP_FILE = 'url_map.json' logger = logging.getLogger('spider.util') +# 全局代理配置,由 spider.py 初始化 +_proxies = None + + +def set_proxies(proxy_url): + """设置全局代理""" + global _proxies + if proxy_url: + _proxies = {'http': proxy_url, 'https': proxy_url} + logger.info(u'已启用代理: %s', proxy_url) + + +def get_proxies(): + return _proxies + def hash_url(url): return hashlib.sha224(url.encode('utf8')).hexdigest() +DEFAULT_UA = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) ' + 'AppleWebKit/537.36 (KHTML, like Gecko) ' + 'Chrome/133.0.0.0 Safari/537.36') + + def handle_html(cookie, url): """处理html""" - try: - user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36' - headers = {'User_Agent': user_agent, 'Cookie': cookie} - resp = requests.get(url, headers=headers) - - if GENERATE_TEST_DATA: - import io - import os - - resp_file = os.path.join(TEST_DATA_DIR, '%s.html' % hash_url(url)) - with io.open(resp_file, 'w', encoding='utf-8') as f: - f.write(resp.text) - - with io.open(os.path.join(TEST_DATA_DIR, URL_MAP_FILE), 'r+') as f: - url_map = json.loads(f.read()) - url_map[url] = resp_file - f.seek(0) - f.write(json.dumps(url_map, indent=4, ensure_ascii=False)) - f.truncate() - - selector = etree.HTML(resp.content) - return selector - except Exception as e: - logger.exception(e) + from time import sleep + headers = {'User-Agent': DEFAULT_UA, 'Cookie': cookie} + for attempt in 
range(5): + try: + resp = requests.get(url, headers=headers, timeout=10, + proxies=_proxies) + if resp.status_code == 200 and len(resp.content) > 0: + selector = etree.HTML(resp.content) + return selector + elif resp.status_code == 403: + wait = 300 * (attempt + 1) + logger.warning(u'403 IP被限制,等待%d秒后重试(第%d次)', + wait, attempt + 1) + sleep(wait) + elif resp.status_code == 432: + logger.error(u'432 User-Agent被拒绝,请更新UA') + return None + else: + wait = 60 * (attempt + 1) + logger.warning(u'请求返回状态码%d,等待%d秒后重试(第%d次)', + resp.status_code, wait, attempt + 1) + sleep(wait) + except Exception as e: + wait = 60 * (attempt + 1) + logger.warning(u'请求异常,等待%d秒后重试(第%d次): %s', + wait, attempt + 1, str(e)) + sleep(wait) + logger.error(u'请求%s失败,已重试5次', url) + return None def handle_garbled(info): @@ -95,9 +120,9 @@ def to_video_download_url(cookie, video_page_url): video_object_url = video_page_url.replace('m.weibo.cn/s/video/show', 'm.weibo.cn/s/video/object') try: - user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36' - headers = {'User_Agent': user_agent, 'Cookie': cookie} - wb_info = requests.get(video_object_url, headers=headers).json() + headers = {'User-Agent': DEFAULT_UA, 'Cookie': cookie} + wb_info = requests.get(video_object_url, headers=headers, + proxies=_proxies).json() video_url = wb_info['data']['object']['stream'].get('hd_url') if not video_url: video_url = wb_info['data']['object']['stream']['url'] diff --git a/weibo_spider/spider.py b/weibo_spider/spider.py index e0d2e41e..09125233 100644 --- a/weibo_spider/spider.py +++ b/weibo_spider/spider.py @@ -385,6 +385,11 @@ def main(_): try: config = _get_config() config_util.validate_config(config) + # 初始化代理 + proxy = config.get('proxy') + if proxy: + from .parser.util import set_proxies + set_proxies(proxy) wb = Spider(config) wb.start() # 爬取微博信息 except Exception as e: