Preface
When building iOS reskinned apps ("马甲包"), you need to add obfuscation code, part of which comes from GitHub. I wrote a script for this in my spare time, using the Scrapy framework.
1. The Item model
import scrapy


class GithubItem(scrapy.Item):
    # define the fields for your item here:
    name = scrapy.Field()         # framework name
    url = scrapy.Field()          # framework homepage URL
    star_number = scrapy.Field()  # number of stars
    update_time = scrapy.Field()  # last update time
    clone_url = scrapy.Field()    # clone address
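For reference, a fully populated item ends up looking roughly like this (the values below are illustrative, not real crawl output):

item = GithubItem()
item['name'] = 'AFNetworking'
item['url'] = 'https://github.com/AFNetworking/AFNetworking'
item['star_number'] = '32.4k'
item['update_time'] = 'Jul 20, 2018'
item['clone_url'] = 'https://github.com/AFNetworking/AFNetworking.git'
print(dict(item))  # scrapy.Item converts to a plain dict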
2. Crawling requirements
- Language: Objective-C
- At least 100 stars (enforced by the star-count filter in the spider; see the parsing sketch below)
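GitHub renders star counts as text such as "856" or "3.4k", so the ">= 100 stars" rule has to be checked against strings. A minimal helper that normalizes them into integers might look like this (the name parse_star_count is my own, not part of the original script):

def parse_star_count(text):
    """Convert GitHub's star text ('856', '3.4k') into an integer."""
    text = text.strip().lower()
    if text.endswith('k'):
        return int(float(text[:-1]) * 1000)
    return int(text.replace(',', ''))

assert parse_star_count(' 3.4k ') == 3400
assert parse_star_count('856') == 856

The spider that applies these requirements: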
from github.items import GithubItem
import scrapy


class GithubSpider(scrapy.Spider):
    name = 'github'
    allowed_domains = ['github.com']
    url = 'https://github.com/search?l=Objective-C&o=desc&q=ios&s=stars&p='
    offset = 1
    start_urls = [
        url + str(offset)
    ]
    # all crawled items
    items = []
    # collected clone URLs
    clone_urls = []

    def parse(self, response):
        names = response.xpath('//ul[@class="repo-list"]/li/div/h3/a/text()').extract()
        # the hrefs are relative; they must be prefixed with https://github.com
        urls = response.xpath('//ul[@class="repo-list"]/li/div/h3/a/@href').extract()
        star_numbers = response.xpath('//ul[@class="repo-list"]/li/div//a[@class="muted-link"]/text()').extract()
        # strip whitespace/newlines and drop invalid values;
        # iterate in reverse so deletion does not shift the remaining indices
        for i in range(len(star_numbers) - 1, -1, -1):
            temp = star_numbers[i].strip()
            # keep values of three or more characters (>= 100 stars) or ones like '3.4k'
            if len(temp) > 2 or temp.find('k') != -1:
                star_numbers[i] = temp
            else:
                del star_numbers[i]
        update_times = response.xpath('//ul[@class="repo-list"]/li/div//p[@class="f6 text-gray mr-3 mb-0 mt-2"]/relative-time/text()').extract()
        for i in range(len(names)):
            item = GithubItem()
            item['name'] = names[i]
            item['url'] = 'https://github.com' + urls[i]
            item['star_number'] = star_numbers[i]
            item['update_time'] = update_times[i]
            self.items.append(item)
            # visit each repository homepage to extract its clone URL
            yield scrapy.Request(url=item['url'], meta={'item': item}, callback=self.parse_article)
        if self.offset < 3:
            self.offset += 1
            # after finishing one results page, request the next one
            yield scrapy.Request(self.url + str(self.offset), callback=self.parse)

    def parse_article(self, response):
        """
        Parse a framework's homepage and extract its clone URL.
        :param response:
        :return:
        """
        item = response.meta['item']
        item['clone_url'] = response.xpath('//div[@class="input-group"]/input/@value').extract_first()
        self.clone_urls.append(item['clone_url'])
        print('clone_urls: %s' % self.clone_urls)
        # hand the completed item to the pipeline/exporter
        yield item
Note: adjust this condition to control how many result pages are fetched:

if self.offset < 3:
    self.offset += 1
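As written, the spider only collects the clone URLs; the original post never shows where GitUtil (next section) gets invoked. One plausible hookup, sketched here as an assumption rather than the author's actual wiring, is Scrapy's closed callback, which runs once the spider finishes:

    # a sketch, not part of the original spider: clone everything once crawling ends
    def closed(self, reason):
        from github.git_util import GitUtil  # assumed module path for GitUtil
        for clone_url in self.clone_urls:
            GitUtil.yh_clone_url(clone_url)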
3. Clone to a specified local path
import os
import pwd
import shutil


class GitUtil(object):

    @classmethod
    def yh_clone_url(cls, url):
        # work out the destination path
        dst_path = cls._dst_path_of_clone(url)
        if os.path.exists(dst_path):
            shutil.rmtree(dst_path)
        os.mkdir(dst_path)
        # run the shell command
        cls._clone_url_to_dst_path(url, dst_path)
        output = os.popen("pwd")
        print(output.read())

    @classmethod
    def _clone_url_to_dst_path(cls, url, dst_path):
        """
        Clone to the destination path.
        :param url: the repository to clone
        :param dst_path: the destination path
        :return:
        """
        os.system("git clone " + url + " " + dst_path)

    @classmethod
    def _dst_path_of_clone(cls, url):
        # work out the path: derive the framework name from the URL
        last_path = os.path.split(url)[-1]
        framework_name = os.path.splitext(last_path)[0]
        home_path = pwd.getpwuid(os.getuid()).pw_dir
        dst_folder_path = os.path.join(home_path, 'Desktop/tmp_git')
        # only create the folder if it is missing; wiping it here would
        # delete every repository cloned earlier in the same run
        if not os.path.exists(dst_folder_path):
            os.mkdir(dst_folder_path)
        dst_path = os.path.join(dst_folder_path, framework_name)
        return dst_path
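Called on its own, the helper looks like this (AFNetworking is just a stand-in repository for illustration):

# clones into ~/Desktop/tmp_git/AFNetworking
GitUtil.yh_clone_url('https://github.com/AFNetworking/AFNetworking.git')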
4. main.py
Run this script to start the crawl:

from scrapy import cmdline

cmdline.execute('scrapy crawl github'.split())
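If you prefer not to go through cmdline, an equivalent way to launch the spider programmatically is Scrapy's CrawlerProcess (a standard Scrapy API, though not what the original post uses):

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl('github')  # spider name, as defined on GithubSpider
process.start()          # blocks until the crawl finishes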