2022-11-24 22:39:58 -08:00
|
|
|
#!/usr/bin/env python3
|
|
|
|
# -*- coding: UTF-8 -*-
|
|
|
|
|
|
|
|
import os
|
|
|
|
import hashlib
|
|
|
|
import tempfile
|
|
|
|
import shutil
|
|
|
|
import traceback
|
|
|
|
import requests
|
|
|
|
|
|
|
|
|
|
|
|
requests.packages.urllib3.disable_warnings()
|
|
|
|
|
|
|
|
# Compute the MD5 digest of a file
|
|
|
|
|
|
|
|
|
|
|
|
def file2md5(_file, chunk_size=1 << 20):
    """Return the hexadecimal MD5 digest of a file's contents.

    The file is hashed in fixed-size chunks so that large files are never
    loaded into memory in one piece.

    Args:
        _file: Path of the file to hash.
        chunk_size: Number of bytes read per iteration (default 1 MiB).

    Returns:
        The 32-character lowercase hex digest.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    md5obj = hashlib.md5()
    with open(_file, 'rb') as f:
        # iter() with a sentinel stops as soon as read() returns b'' (EOF).
        for chunk in iter(lambda: f.read(chunk_size), b''):
            md5obj.update(chunk)
    return md5obj.hexdigest()
|
|
|
|
|
|
|
|
# Search code on GitHub
|
|
|
|
|
|
|
|
|
|
|
|
def searchcode(keyword, page=1, per_page=100):
    """Query the GitHub code-search API and return the decoded JSON reply.

    Args:
        keyword: Search expression sent as the ``q`` parameter.
        page: 1-based result page to fetch.
        per_page: Number of results requested per page.

    Returns:
        The decoded JSON object as a dict, or an empty dict when the request
        fails, the body is not valid JSON, or the body is not a JSON object.
    """
    headers = {
        'Connection': 'close',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.119 Safari/537.36',
    }
    # Only send credentials when a token is configured; otherwise the header
    # would carry the literal string "token None".
    token = os.getenv('GH_TOKEN')
    if token:
        headers['Authorization'] = 'token {}'.format(token)

    params = {
        'q': keyword,
        'sort': 'indexed',
        'order': 'desc',
        'page': page,
        'per_page': per_page,
    }
    try:
        # TLS verification is left at its default (enabled): the request
        # carries an access token, so the peer must be authenticated.
        response = requests.get(
            'https://api.github.com/search/code',
            params=params,
            headers=headers,
            allow_redirects=False,
            timeout=10,
        )
        result = response.json()
    except (requests.RequestException, ValueError):
        # Network failure or a non-JSON body: callers treat {} as "no results".
        return {}
    # Callers use dict methods on the result, so never hand back a list/scalar.
    return result if isinstance(result, dict) else {}
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    # Script entry point: collect Goby PoC files (.go / .json) from GitHub
    # repositories found through code search, copy the ones not yet present
    # into the local "go" and "json" directories, then rewrite README.md
    # with the per-type file counts.

    # Imported locally because only the script entry point needs it.
    import subprocess

    poc_suffixes = ('.json', '.go')

    # Resolve every path against the script's own directory so the result
    # does not depend on the directory the script is launched from.
    root_path = os.path.dirname(os.path.abspath(__file__))
    print(root_path)
    go_dir = os.path.join(root_path, 'go')
    json_dir = os.path.join(root_path, 'json')

    # MD5 digests of the PoCs that are already collected.
    known_md5 = set()
    for poc_dir in (go_dir, json_dir):
        # Create the output directory on a first run instead of crashing.
        os.makedirs(poc_dir, exist_ok=True)
        for name in os.listdir(poc_dir):
            if name.endswith(poc_suffixes):
                known_md5.add(file2md5(os.path.join(poc_dir, name)))

    # Run the code search and gather the home pages of matching repositories.
    # NOTE(review): the '+' in these keywords is percent-encoded when passed
    # through ``params=``, so it may not act as a term separator -- confirm
    # whether a space was intended.
    html_urls = set()
    for keyword in ['GobyQuery+language:Go', 'GobyQuery+language:Json']:
        for page in range(1, 11):
            try:
                rs = searchcode(keyword, page=page, per_page=100)
                for item in rs.get('items', []):
                    repo_url = item.get('repository', {}).get('html_url')
                    if repo_url:
                        html_urls.add(repo_url)
            except Exception:
                # Best effort: a malformed page must not stop the others.
                traceback.print_exc()

    # Sorted so that runs over the same result set behave deterministically.
    for url in sorted(html_urls):
        print(url)
        try:
            # The context manager removes the clone when the block exits,
            # including when an error is raised.
            with tempfile.TemporaryDirectory() as temp_dir:
                repo_name = url.rstrip('/').rsplit('/', 1)[-1]
                repo_path = os.path.join(temp_dir, repo_name)

                # Clone the project. An argument list (no shell) keeps the
                # URL from being interpreted as shell syntax, and '--' keeps
                # it from being parsed as a git option. Only the working tree
                # is needed, so a shallow clone is enough.
                subprocess.run(
                    ['git', 'clone', '--depth', '1', '--', url, repo_path])
                print(repo_path)

                # A failed clone leaves no directory; move on to the next URL.
                if not os.path.isdir(repo_path):
                    continue

                # Copy the PoCs.
                for root, dirs, files in os.walk(repo_path):
                    # Git metadata never contains PoCs; do not descend.
                    if '.git' in dirs:
                        dirs.remove('.git')
                    for name in files:
                        if not name.endswith(poc_suffixes):
                            continue
                        file_path = os.path.join(root, name)
                        try:
                            with open(file_path, 'r', encoding='utf8') as f:
                                content = f.read()
                            # Both markers must be present for the file to
                            # count as a Goby PoC.
                            if 'GobyQuery' not in content or 'ScanSteps' not in content:
                                continue
                            print(file_path)
                            digest = file2md5(file_path)
                            if digest in known_md5:
                                continue
                            dest_dir = json_dir if name.endswith('.json') else go_dir
                            shutil.copyfile(
                                file_path, os.path.join(dest_dir, name))
                            # Remember the digest so an identical file from
                            # another repository is not copied again.
                            known_md5.add(digest)
                        except (OSError, UnicodeError):
                            # Unreadable or non-UTF-8 file: report and go on.
                            traceback.print_exc()
        except Exception:
            # Best effort: one broken repository must not abort the run.
            traceback.print_exc()

    # Rewrite the README with the current number of collected PoCs.
    go_count = sum(1 for name in os.listdir(go_dir) if name.endswith('.go'))
    json_count = sum(
        1 for name in os.listdir(json_dir) if name.endswith('.json'))
    with open(os.path.join(root_path, 'README.md'), 'w', encoding='utf8') as f:
        f.write('# Goby POC统计\n| 文件类型 | 数量 |\n| :----:| :----: |\n| .go | {} |\n| .json | {} |'.format(
            go_count, json_count))
|