这个网页的内容能提取吗?

@Ta 2020-02-03 8439点击
https://ncov.dxy.cn/ncovh5/view/pneumonia?

nys.py(806 bytes)
我想提取各个地区的一些数量,懵了一天还是没有写出来,有友友帮忙指点一下吗?
回复列表(10|隐藏机器人聊天)
  • @Ta / 2020-02-03 / /
    用这个 requests_html。我是一名斗者,我正在被一名斗帝追杀,但我一点都不慌,因为他的马没我快,架!!!
  • hik
    @Ta / 2020-02-03 / /
    复制粘贴到excel中即可
  • @Ta / 2020-02-03 / /
    @姑娘等等丶,requests_html?与requests有什么不同吗?真诚请教
  • @Ta / 2020-02-03 / /
    @hik,这。。。
  • @Ta / 2020-02-03 / /
    @yes_h_ut,8行代码
    #coding=utf-8
    # Scrape the DXY (Dingxiangyuan / Dingxiang Doctor) nCoV page and print the
    # JSON array embedded in <script id="getListByCountryTypeService2">.
    import re
    from requests_html import HTMLSession
    session = HTMLSession()
    url = 'https://ncov.dxy.cn/ncovh5/view/pneumonia?'
    r = session.get(url)
    # NOTE: find(..., first=True) gives None when the tag is absent, in which
    # case the .text access below raises AttributeError.
    script_text = r.html.find('script#getListByCountryTypeService2', first=True).text
    # Raw string avoids the invalid "\[" escape; the greedy pattern with DOTALL
    # spans from the first '[' to the last ']', so a ']' or a newline inside the
    # data no longer truncates the match. Names no longer shadow `str` / `json`.
    match = re.search(r"\[.*\]", script_text, re.DOTALL)
    print(match.group())
    
    我是一名斗者,我正在被一名斗帝追杀,但我一点都不慌,因为他的马没我快,架!!!
  • @Ta / 2020-02-04 / /
    @姑娘等等丶,[{
    "id": 953,
    "createTime": 1580027704000,
    "modifyTime": 1580561200000,
    "tags": "",
    "countryType": 2,
    "continents": "亚洲",
    "provinceId": "6",
    "provinceName": "日本",
    "provinceShortName": "",
    "cityName": "",
    "confirmedCount": 20,
    "suspectedCount": 0,
    "curedCount": 1,
    "deadCount": 0,
    "comment": "",
    "sort": 0,
    "operator": "xuyt",
    "locationId": 951002
    }, {
    "id": 949,
    "createTime": 1580027637000,
    "modifyTime": 1580495765000,
    "tags": "",
    "countryType": 2,
    "continents": "亚洲",
    "provinceId": "2",
    "provinceName": "泰国",
    "provinceShortName": "",
    "cityName": "",
    "confirmedCount": 19,
    "suspectedCount": 0,
    "curedCount": 5,
    "deadCount": 0,
    "comment": "",
    "sort": 0,
    "operator": "xuyt",
    "locationId": 952010
    }, {
    "id": 950,
    "createTime": 1580027655000,
    "modifyTime": 1580619404000,
    "tags": "",
    "countryType": 2,
    "continents": "亚洲",
    "provinceId": "3",
    "provinceName": "新加坡",
    "provinceShortName": "",
    "cityName": "",
    "confirmedCount": 18,
    "suspectedCount": 0,
    "curedCount": 0,
    "deadCount": 0,
    "comment": "",
    "sort": 0,
    "operator": "xuyt",
    "locationId": 952009
    }, {
    "id": 954,
    "createTime": 1580027721000,
    "modifyTime": 1580609979000,
    "tags": "",
    "countryType": 2,
    "continents": "亚洲",
    "provinceId": "7",
    "provinceName": "韩国",
    "provinceShortName": "",
    "cityName": "",
    "confirmedCount": 15,
    "suspectedCount": 0,
    "curedCount": 0,
    "deadCount": 0,
    "comment": "",
    "sort": 0,
    "operator": "xuyt",
    "locationId": 951004
    }, {
    "id": 958,
    "createTime": 1580027777000,
    "modifyTime": 1580620070000,
    "tags": "",
    "countryType": 2,
    "continents": "大洋洲",
    "provinceId": "10",
    "provinceName": "澳大利亚",
    "provinceShortName": "",
    "cityName": "",
    "confirmedCount": 12,
    "suspectedCount": 0,
    "curedCount": 2,
    "deadCount": 0,
    "comment": "",
    "sort": 0,
    "operator": "xuyt",
    "locationId": 990001
    }, {
    "id": 1047,
    "createTime": 1580167302000,
    "modifyTime": 1580784147000,
    "tags": "",
    "countryType": 2,
    "continents": "欧洲",
    "provinceId": "10",
    "provinceName": "德国",
    "provinceShortName": "",
    "cityName": "",
    "confirmedCount": 12,
    "suspectedCount": 0,
    "curedCount": 0,
    "deadCount": 0,
    "comment": "",
    "sort": 0,
    "operator": "shiweiyi",
    "locationId": 963003
    }, {
    "id": 955,
    "createTime": 1580027735000,
    "modifyTime": 1580771385000,
    "tags": "",
    "countryType": 2,
    "continents": "北美洲",
    "provinceId": "8",
    "provinceName": "美国",
    "provinceShortName": "",
    "cityName": "",
    "confirmedCount": 11,
    "suspectedCount": 0,
    "curedCount": 0,
    "deadCount": 0,
    "comment": "",
    "sort": 0,
    "operator": "xuyt",
    "locationId": 971002
    }, {
    "id": 951,
    "createTime": 1580027668000,
    "modifyTime": 1580495813000,
    "tags": "",
    "countryType": 2,
    "continents": "亚洲",
    "provinceId": "4",
    "provinceName": "马来西亚",
    "provinceShortName": "",
    "cityName": "",
    "confirmedCount": 8,
    "suspectedCount": 0,
    "curedCount": 0,
    "deadCount": 0,
    "comment": "",
    "sort": 0,
    "operator": "xuyt",
    "locationId": 952007
    }, {
    "id": 952,
    "createTime": 1580027683000,
    "modifyTime": 1580516875000,
    "tags": "",
    "countryType": 2,
    "continents": "欧洲",
    "provinceId": "5",
    "provinceName": "法国",
    "provinceShortName": "",
    "cityName": "",
    "confirmedCount": 6,
    "suspectedCount": 0,
    "curedCount": 0,
    "deadCount": 0,
    "comment": "",
    "sort": 0,
    "operator": "xuyt",
    "locationId": 961002
    }, {
    "id": 956,
    "createTime": 1580027751000,
    "modifyTime": 1580539329000,
    "tags": "",
    "countryType": 2,
    "continents": "亚洲",
    "provinceId": "9",
    "provinceName": "越南",
    "provinceShortName": "",
    "cityName": "",
    "confirmedCount": 6,
    "suspectedCount": 0,
    "curedCount": 1,
    "deadCount": 0,
    "comment": "",
    "sort": 0,
    "operator": "xuyt",
    "locationId": 952011
    }, {
    "id": 1650,
    "createTime": 1580276720000,
    "modifyTime": 1580621660000,
    "tags": "",
    "countryType": 2,
    "continents": "亚洲",
    "provinceId": "10",
    "provinceName": "阿联酋",
    "provinceShortName": "",
    "cityName": "",
    "confirmedCount": 5,
    "suspectedCount": 0,
    "curedCount": 0,
    "deadCount": 0,
    "comment": "",
    "sort": 0,
    "operator": "yuting",
    "locationId": 955019
    }, {
    "id": 959,
    "createTime": 1580027795000,
    "modifyTime": 1580552879000,
    "tags": "",
    "countryType": 2,
    "continents": "北美洲",
    "provinceId": "10",
    "provinceName": "加拿大",
    "provinceShortName": "",
    "cityName": "",
    "confirmedCount": 4,
    "suspectedCount": 0,
    "curedCount": 0,
    "deadCount": 0,
    "comment": "",
    "sort": 0,
    "operator": "xuyt",
    "locationId": 971001
    }, {
    "id": 1954,
    "createTime": 1580375521000,
    "modifyTime": 1580622802000,
    "tags": "",
    "countryType": 2,
    "continents": "亚洲",
    "provinceId": "10",
    "provinceName": "印度",
    "provinceShortName": "",
    "cityName": "",
    "confirmedCount": 2,
    "suspectedCount": 0,
    "curedCount": 0,
    "deadCount": 0,
    "comment": "",
    "sort": 0,
    "operator": "gaoh",
    "locationId": 953003
    }, {
    "id": 1957,
    "createTime": 1580425693000,
    "modifyTime": 1580516912000,
    "tags": "",
    "countryType": 2,
    "continents": "欧洲",
    "provinceId": "10",
    "provinceName": "意大利",
    "provinceShortName": "",
    "cityName": "",
    "confirmedCount": 2,
    "suspectedCount": 0,
    "curedCount": 0,
    "deadCount": 0,
    "comment": "",
    "sort": 0,
    "operator": "shiweiyi",
    "locationId": 965008
    }, {
    "id": 2280,
    "createTime": 1580464639000,
    "modifyTime": 1580495938000,
    "tags": "",
    "countryType": 2,
    "continents": "欧洲",
    "provinceId": "10",
    "provinceName": "英国",
    "provinceShortName": "",
    "cityName": "",
    "confirmedCount": 2,
    "suspectedCount": 0,
    "curedCount": 0,
    "deadCount": 0,
    "comment": "",
    "sort": 0,
    "operator": "leimanman",
    "locationId": 961007
    }, {
    "id": 2281,
    "createTime": 1580476133000,
    "modifyTime": 1580495947000,
    "tags": "",
    "countryType": 2,
    "continents": "欧洲",
    "provinceId": "10",
    "provinceName": "俄罗斯",
    "provinceShortName": "",
    "cityName": "",
    "confirmedCount": 2,
    "suspectedCount": 0,
    "curedCount": 0,
    "deadCount": 0,
    "comment": "",
    "sort": 0,
    "operator": "hekaiqi",
    "locationId": 964006
    }, {
    "id": 2870,
    "createTime": 1580566796000,
    "modifyTime": 1580614212000,
    "tags": "",
    "countryType": 2,
    "continents": "亚洲",
    "provinceId": "10",
    "provinceName": "菲律宾",
    "provinceShortName": "",
    "cityName": "",
    "confirmedCount": 2,
    "suspectedCount": 0,
    "curedCount": 0,
    "deadCount": 1,
    "comment": "",
    "sort": 0,
    "operator": "hekaiqi",
    "locationId": 952008
    }, {
    "id": 957,
    "createTime": 1580027764000,
    "modifyTime": 1580495955000,
    "tags": "",
    "countryType": 2,
    "continents": "亚洲",
    "provinceId": "10",
    "provinceName": "尼泊尔",
    "provinceShortName": "",
    "cityName": "",
    "confirmedCount": 1,
    "suspectedCount": 0,
    "curedCount": 0,
    "deadCount": 0,
    "comment": "",
    "sort": 0,
    "operator": "xuyt",
    "locationId": 953005
    }, {
    "id": 1023,
    "createTime": 1580130061000,
    "modifyTime": 1580495965000,
    "tags": "",
    "countryType": 2,
    "continents": "亚洲",
    "provinceId": "10",
    "provinceName": "柬埔寨",
    "provinceShortName": "",
    "cityName": "",
    "confirmedCount": 1,
    "suspectedCount": 0,
    "curedCount": 0,
    "deadCount": 0,
    "comment": "",
    "sort": 0,
    "operator": "hanting",
    "locationId": 952003
    }, {
    "id": 1306,
    "createTime": 1580182019000,
    "modifyTime": 1580495982000,
    "tags": "",
    "countryType": 2,
    "continents": "亚洲",
    "provinceId": "10",
    "provinceName": "斯里兰卡",
    "provinceShortName": "",
    "cityName": "",
    "confirmedCount": 1,
    "suspectedCount": 0,
    "curedCount": 0,
    "deadCount": 0,
    "comment": "",
    "sort": 0,
    "operator": "zhuotingting",
    "locationId": 953007
    }, {
    "id": 1656,
    "createTime": 1580337882000,
    "modifyTime": 1580495991000,
    "tags": "",
    "countryType": 2,
    "continents": "欧洲",
    "provinceId": "10",
    "provinceName": "芬兰",
    "provinceShortName": "",
    "cityName": "",
    "confirmedCount": 1,
    "suspectedCount": 0,
    "curedCount": 0,
    "deadCount": 0,
    "comment": "",
    "sort": 0,
    "operator": "zhuotingting",
    "locationId": 962002
    }, {
    "id": 2293,
    "createTime": 1580512517000,
    "modifyTime": 1580514491000,
    "tags": "",
    "countryType": 2,
    "continents": "欧洲",
    "provinceId": "10",
    "provinceName": "西班牙",
    "provinceShortName": "",
    "cityName": "",
    "confirmedCount": 1,
    "suspectedCount": 0,
    "curedCount": 0,
    "deadCount": 0,
    "comment": "",
    "sort": 0,
    "operator": "zhuotingting",
    "locationId": 965015
    }, {
    "id": 2596,
    "createTime": 1580516851000,
    "modifyTime": 1580516851000,
    "tags": "",
    "countryType": 2,
    "continents": "欧洲",
    "provinceId": "10",
    "provinceName": "瑞典",
    "provinceShortName": "",
    "cityName": "",
    "confirmedCount": 1,
    "suspectedCount": 0,
    "curedCount": 0,
    "deadCount": 0,
    "comment": "",
    "sort": 0,
    "operator": "leimanman",
    "locationId": 962005
    }]
    最终获得的是这样一个结果。
    如果要获得 id 为 953 的那条记录里 provinceName 的内容,要怎么操作?(说个思路就行)
    但是结果的类型是 str,是不是有必要先转成 dict 类型?
  • @Ta / 2020-02-04 / /
    @yes_h_ut,百度搜 json 在线解析,复制进去解析后分析,再百度搜 python json 解析。我是一名斗者,我正在被一名斗帝追杀,但我一点都不慌,因为他的马没我快,架!!!
  • @Ta / 2020-02-04 / /
    @姑娘等等丶, 
    百度基本上是以bs4库处理。。。
    ....
    # Download the page. NOTE(review): `url1`, `headers` and the imports are not
    # part of this fragment; they are assumed to be defined in the omitted code.
    r=requests.get(url1,headers=headers)
    # Decode the raw response bytes as UTF-8 and parse them with the lxml parser.
    bs = BeautifulSoup(r.content.decode("utf-8"), "lxml")
    # print(bs.prettify())  # pretty-print the parsed document (debugging aid)
    # Collect every <script> tag whose id attribute is 'getAreaStat'.
    bs1=bs.find_all('script',attrs={'id':'getAreaStat'})
    得到的结果(遇到的问题):
    1.含有标签头[<script id="getAreaStat">,尝试用json1 = re.search("\[(.*?)\]",str1)将标签头去掉,提示类型不对【find_all后的输出如何不带tag】
    2.输出的类型是 bs4.element.ResultSet,长度为1,不能用for循环取值。
    是否需要将类型转化才能以下面的结构输出:
    # Print the index and case counts of every record whose provinceName
    # contains '浙江省'. Iteration starts at index 0: the previous
    # range(1, len(...)) silently skipped the first record.
    # NOTE(review): assumes `decodejson` is the already-parsed list of dicts
    # and that `pprint` is imported in the omitted part of the script.
    for ix, province in enumerate(decodejson):
        if '浙江省' in province['provinceName']:
            pprint.pprint(str(ix))
            pprint.pprint('death: '+str(province['deadCount']))
            pprint.pprint('confirmedCount: '+str(province['confirmedCount']))
            pprint.pprint('suspectedCount: '+str(province['suspectedCount']))
    以上

  • @Ta / 2020-02-04 / /
    @yes_h_ut
    #coding=utf-8
    # Scrape the DXY (Dingxiangyuan / Dingxiang Doctor) nCoV page, parse the
    # JSON array embedded in <script id="getListByCountryTypeService2"> and
    # print its first record plus that record's provinceName.
    import re
    import json
    from requests_html import HTMLSession
    session = HTMLSession()
    url = 'https://ncov.dxy.cn/ncovh5/view/pneumonia?'
    r = session.get(url)
    # find(..., first=True) gives None when the tag is missing; fail with a
    # readable message instead of an AttributeError on `.text`.
    script = r.html.find('script#getListByCountryTypeService2', first=True)
    if script is None:
        raise SystemExit('script#getListByCountryTypeService2 not found in the page')
    # Raw string avoids the invalid "\[" escape; the greedy pattern with DOTALL
    # spans from the first '[' to the last ']', so a ']' or a newline inside the
    # data cannot truncate the JSON. The builtin `str` is no longer shadowed.
    match = re.search(r"\[.*\]", script.text, re.DOTALL)
    if match is None:
        raise SystemExit('no JSON array found in the script text')
    countries = json.loads(match.group())  # a list of dicts, one per record
    print(countries[0])  # first element of the array
    print(countries[0]["provinceName"])  # one field of that JSON object
    
    我是一名斗者,我正在被一名斗帝追杀,但我一点都不慌,因为他的马没我快,架!!!
  • @Ta / 2020-02-04 / /
    @姑娘等等丶
    # -*- coding: utf-8 -*-
    # Fetch the DXY nCoV page, parse the JSON array embedded in
    # <script id="getAreaStat"> and print the case counts of every city whose
    # name contains '成都'.
    import requests
    import os
    from bs4 import BeautifulSoup
    import re
    import json
    import pprint

    url1 = 'https://ncov.dxy.cn/ncovh5/view/pneumonia?'
    headers = {
        'User-Agent': 'self-defind-user-agent',
        'Cookie': 'name=self-define-cookies-in header',
        'charset': 'utf-8'
    }

    # A timeout keeps the script from hanging forever on a stalled connection;
    # raise_for_status() turns HTTP error responses into an explicit exception.
    r = requests.get(url1, headers=headers, timeout=10)
    r.raise_for_status()
    bs = BeautifulSoup(r.content.decode("utf-8"), "lxml")

    # find() returns the first matching Tag or None, so a missing tag can be
    # reported clearly instead of failing with IndexError on find_all(...)[0].
    script_tag = bs.find('script', attrs={'id': 'getAreaStat'})
    if script_tag is None or script_tag.string is None:
        raise SystemExit('script#getAreaStat not found or empty in the page')
    print(str(type(script_tag)))
    print('............................................')

    # Greedy match with DOTALL: from the first '[' to the last ']'. Unlike the
    # earlier pattern that required a literal '}]}]' tail, this does not depend
    # on how the last record of the array happens to end.
    match = re.search(r"\[.*\]", script_tag.string, re.DOTALL)
    if match is None:
        raise SystemExit('no JSON array found in script#getAreaStat')
    decodejson = json.loads(match.group())
    pprint.pprint(str(type(decodejson)))
    print('............................................')

    # NOTE(review): assumes each record carries a 'cities' list of dicts with
    # the keys below; records without 'cities' are skipped rather than crashing.
    fields = ('cityName', 'confirmedCount', 'suspectedCount', 'curedCount', 'deadCount')
    for province in decodejson:
        for city in province.get('cities') or []:
            if '成都' in city['cityName']:
                for field in fields:
                    pprint.pprint(field + ': ' + str(city[field]))
    print('............................................')
    谢谢指点。
    折腾好久才弄出来,不过总感觉有些操作是没有用的。
添加新回复
回复需要登录