-
目标:https://www.wsj.com/
-
核心需求:
登录华尔街日报需要_csrf参数的值,本文目的就是为了获得这个_csrf的值。
求值流程:
第一步:
创建requests.Session()对象,
确定登录页的URL:https://accounts.wsj.com/login?target=https%3A%2F%2Fwww.wsj.com%2F
第二步:
用Session对象的get方法请求该URL,再用正则表达式从返回的页面内容中提取经过base64编码的字符串(base64只是一种编码方式,并不是加密)
第三步:
用base64.b64decode解码,再用json.loads解析,得到如下所示的字典,从其中的extraParams里取出_csrf
{"assetsUrl":"https://sso-config.accounts.dowjones.com/","auth0Domain":"sso.accounts.dowjones.com","auth0Tenant":"sso","callbackOnLocationHash":false,"callbackURL":"https://accounts.wsj.com/auth/sso/login","cdn":"https://sso-config.accounts.dowjones.com/","clientID":"5hssEAdMy0mJTICnJNvC9TXEw3Va7jfO","dict":{"signin":{"title":"DowJonesWeb"}},"extraParams":{"protocol":"oauth2","scope":"openid idp_id roles email given_name family_name djid djUsername djStatus trackid tags prts","response_type":"code","nonce":"034ed2ac-7b52-43ce-a21b-1a9a5ba1766f","ui_locales":"en-us-x-wsj-19-2","ns":"prod/accounts-wsj","_csrf":"R1h5S3fd-3CUFmsKi5jad5wN37L7kK93Pb6Y","_intstate":"deprecated","state":"g6Fo2SBQR1h4WWNSNXBiZDlCWkRuR1V5b2c5RlBENzBIdGdRd6N0aWTZIFYxMnM3LWUxVHRoMDlWUlRkLXhDV3ZJOTdKVHpaNFBzo2NpZNkgNWhzc0VBZE15MG1KVElDbkpOdkM5VFhFdzNWYTdqZk8"},"internalOptions":{"protocol":"oauth2","scope":"openid idp_id roles email given_name family_name djid djUsername djStatus trackid tags prts","response_type":"code","nonce":"034ed2ac-7b52-43ce-a21b-1a9a5ba1766f","ui_locales":"en-us-x-wsj-19-2","ns":"prod/accounts-wsj","_csrf":"R1h5S3fd-3CUFmsKi5jad5wN37L7kK93Pb6Y","_intstate":"deprecated","state":"g6Fo2SBQR1h4WWNSNXBiZDlCWkRuR1V5b2c5RlBENzBIdGdRd6N0aWTZIFYxMnM3LWUxVHRoMDlWUlRkLXhDV3ZJOTdKVHpaNFBzo2NpZNkgNWhzc0VBZE15MG1KVElDbkpOdkM5VFhFdzNWYTdqZk8"},"widgetUrl":"https://cdn.auth0.com/w2/auth0-widget-5.2.min.js","isThirdPartyClient":false,"authorizationServer":{"url":"https://sso.accounts.dowjones.com","issuer":"https://sso.accounts.dowjones.com/"},"colors":{}}
第四步:
随意输入错误的账号和密码,通过F12(浏览器开发者工具)找出登录请求的URL:https://sso.accounts.dowjones.com/usernamepassword/login
按下图中的参数一一对应构造请求体,通过POST请求访问该URL,
取出响应内容中的wa、wresult和wctx值。
通过Fiddler抓包查看请求内容如下:
第五步:
由下图可知,前四步是为了得到wa、wresult和wctx值;带上这三个参数POST请求该URL:https://sso.accounts.dowjones.com/login/callback
最后返回requests.Session()对象
代码如下:
def login(username="694021389@qq.com", password="qq393305"):
    """Run the Dow Jones SSO login flow for wsj.com and return the session.

    The flow has five steps:

    1. Create a ``requests.Session`` that is reused for every request.
    2. Fetch the WSJ login page, which embeds a Base64-encoded JSON config.
    3. Decode that config and read ``_csrf`` (plus ``state`` and ``nonce``
       when present) from its ``extraParams`` entry.
    4. POST the credentials together with those values to the
       ``usernamepassword/login`` endpoint and parse the HTML answer for the
       hidden ``wa``, ``wresult`` and ``wctx`` inputs.
    5. POST those three inputs to the ``login/callback`` endpoint.

    Args:
        username: Account e-mail address. Defaults to the address that was
            previously hard-coded in this function.
        password: Account password. Defaults to the previously hard-coded
            value.

    Returns:
        The ``requests.Session`` that performed all three requests. The
        status of the final callback is not checked here.

    Raises:
        IndexError: If the login page has no embedded Base64 config, or the
            login response lacks one of the ``wa``/``wresult``/``wctx``
            hidden inputs.
        KeyError: If the decoded config has no ``extraParams`` or ``_csrf``.
    """
    user_agent = (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36'
    )

    # Step 1: one session for the whole flow.
    s = requests.Session()
    # NOTE(review): requests does not appear to read a ``keep_alive``
    # attribute on Session; kept so the returned object is unchanged.
    s.keep_alive = False
    url = 'https://accounts.wsj.com/login?target=https%3A%2F%2Fwww.wsj.com%2F'

    # Step 2: fetch the login page and locate the Base64-encoded config.
    res = s.get(url, headers={'User-Agent': user_agent})
    found = re.search(r'Base64.decode\((.*?)\)\)', res.text)
    if found is None:
        raise IndexError(
            'login page does not contain the Base64-encoded config')

    # Step 3: decode the config. The captured text may include the quotes
    # around the literal; b64decode skips non-alphabet characters by default.
    config = json.loads(base64.b64decode(found.group(1)).decode(encoding='utf-8'))
    extra = config['extraParams']
    csrf = extra['_csrf']
    # state and nonce are issued alongside _csrf in the same config, so use
    # the fresh values; the literals from the original capture are only a
    # fallback when the config does not provide them.
    state = extra.get(
        'state',
        "g6Fo2SBia2txZGRZVkhKTnV6U20xM2xNZFdIbExja1VwMkFxc6N0aWTZIFJvemNJTlBIV085M0JCVXM2b2pOTUIxRmUwaHE5bmxVo2NpZNkgNWhzc0VBZE15MG1KVElDbkpOdkM5VFhFdzNWYTdqZk8",
    )
    nonce = extra.get('nonce', '447d921e-633f-4722-97b2-7c8f5a51f4f8')

    # Step 4: submit the credentials as a JSON body.
    data = {
        'client_id': "5hssEAdMy0mJTICnJNvC9TXEw3Va7jfO",
        'redirect_uri': "https://accounts.wsj.com/auth/sso/login",
        'tenant': "sso",
        'response_type': "code",
        'username': username,
        'password': password,
        'scope': "openid idp_id roles email given_name family_name djid djUsername djStatus trackid tags prts",
        'state': state,
        'protocol': "oauth2",
        'nonce': nonce,
        'ui_locales': "en-us-x-wsj-19-2",
        'ns': "prod/accounts-wsj",
        'savelogin': "on",
        'type': "code",
        '_csrf': csrf,
        '_intstate': "deprecated",
        # Same text the original sent, now built from the username argument.
        'headers': "{'x-_remote-_user': '%s'}" % username,
        'connection': "DJldap",
    }
    post_url = 'https://sso.accounts.dowjones.com/usernamepassword/login'
    # Content-Length is intentionally not set by hand: the body length
    # depends on the credentials and tokens, and requests computes it.
    # NOTE(review): the referer points at an unrelated nytimes.com article
    # and accept-encoding advertises "br"; both are kept from the original
    # capture - confirm they are wanted.
    res2 = s.post(post_url, data=json.dumps(data), headers={
        'user-agent': user_agent,
        'referer': 'https://www.nytimes.com/2020/04/08/obituaries/ahmed-ismail-hussein-dead-coronavirus.html',
        'content-type': 'application/json',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9,ku;q=0.8',
        'auth0-client': 'eyJuYW1lIjoiYXV0aDAuanMtdWxwIiwidmVyc2lvbiI6IjkuMTEuMyJ9',
        'origin': 'https://sso.accounts.dowjones.com',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-origin',
    })

    # Pull the hidden form inputs out of the HTML answer.
    tree = etree.HTML(res2.text)
    c_data = {}
    for field in ('wa', 'wresult', 'wctx'):
        values = tree.xpath('//input[@name="%s"]/@value' % field)
        if not values:
            raise IndexError(
                'login response has no hidden input %r; '
                'the login was likely not accepted' % field)
        c_data[field] = values[0]

    # Step 5: hand the three values to the callback endpoint.
    callback_url = "https://sso.accounts.dowjones.com/login/callback"
    s.post(callback_url, data=c_data)
    return s











网友评论