2017-06-01 106 views
0

我有下面这样一个由列表和字典嵌套而成的数据结构,需要在其中不同的列表之间做映射:

{'linked': {'instructors.v1': 
       [{'id': '3219339', 'fullName': 'Lisa Mazzola'}, 
        {'id': '6407572', 'fullName': 'Alan S. Miller '}, 
        {'id': '226710', 'fullName': 'Kevin Werbach'}, 
        {'id': '8054217', 'fullName': '许 肖潇'}, 
        {'id': '20696355', 'fullName': 'Варшалович Дмитрий Александрович'}, 
        {'id': '15622422', 'fullName': 'Prof. James Evans'}]}, 

'elements': 
    [{ 'id': '69Bku0KoEeWZtA4u62x6lQ', 'name': 'Gamification','instructorIds': '226710'}] 

} 

我想根据 'elements' 中的 'instructorIds',在 'instructors.v1' 中匹配出对应讲师的 'fullName'。我的做法是另外创建一个字典,如下所示:

{'3219339': 'Lisa Mazzola'} 
{'6407572': 'Alan S. Miller'} 
{'226710': 'Kevin Werbach'} 

但这会抛出 KeyError: '226710',尽管 226710 确实存在于列表中。请问有没有其他方法,或者我到底哪里做错了?

下面是供参考的 Python 代码:

import imp 
import importlib 
import requests 
import json 
import re 
from bs4 import BeautifulSoup 
import csv 
import sys 
import urllib.request 
from importlib import reload 

# Coursera catalog query: the first 20 courses plus the linked instructor and
# partner records needed to resolve ids into display names.
COURSES_URL = (
    "https://api.coursera.org/api/courses.v1?start=0&limit=20"
    "&includes=instructorIds,partnerIds,specializations,s12nlds,v1Details,v2Details"
    "&fields=instructorIds,partnerIds,specializations,s12nlds,description"
)
OUTPUT_PATH = "courserarough1.csv"
CSV_HEADER = [
    "instructors",
    "courseURL",
    "courseType",
    "CourseName",
    "partnerName",
    "slug",
    "specializations",
    "course_id",
    "description",
]


def build_lookup(records, value_key):
    """Map each record's ``id`` (as a string) to its stripped ``value_key`` field.

    Args:
        records: iterable of dicts that each carry an ``'id'`` entry.
        value_key: name of the field to use as the mapped value
            (e.g. ``'fullName'`` or ``'name'``).

    Returns:
        A single dict covering *all* records.

    Raises:
        KeyError: if a record has no ``'id'`` entry.
    """
    return {
        str(record["id"]): str(record.get(value_key, "")).strip()
        for record in records
    }


def _as_list(value):
    """Normalize None, a scalar id, or an iterable of ids into a list."""
    if value is None:
        return []
    if isinstance(value, (str, int)):
        return [value]
    return list(value)


def _names_for(ids, lookup):
    """Resolve ``ids`` through ``lookup`` and join the names with '; '.

    Ids that are missing from ``lookup`` are skipped instead of raising.
    """
    keys = (str(item) for item in _as_list(ids))
    return "; ".join(lookup[key] for key in keys if key in lookup)


def course_row(course, instructor_names, partner_names):
    """Build one CSV row (ordered like ``CSV_HEADER``) for a course element.

    Args:
        course: one dict from the response's ``'elements'`` list.
        instructor_names: dict of instructor id -> full name.
        partner_names: dict of partner id -> partner name.

    Returns:
        A list of nine strings; missing fields become empty strings.
    """
    slug = str(course.get("slug", ""))
    # Collapse newlines and runs of whitespace so each course stays on one line.
    description = " ".join(str(course.get("description") or "").split())
    return [
        _names_for(course.get("instructorIds"), instructor_names),
        "https://www.coursera.org/learn/" + slug,
        str(course.get("courseType") or "").replace("v2.", ""),
        str(course.get("name", "")).strip(),
        _names_for(course.get("partnerIds"), partner_names),
        slug,
        "; ".join(str(item) for item in _as_list(course.get("specializations"))),
        str(course.get("id", "")),
        description,
    ]


def main():
    """Download the course list and append one CSV row per course."""
    response = requests.get(COURSES_URL, timeout=30)
    response.raise_for_status()
    data = response.json()

    linked = data.get("linked", {})
    # Build the id -> name maps once, before any row is written. Rebuilding a
    # one-entry dict per instructor would keep only the last instructor and
    # make every other id lookup raise KeyError.
    instructor_names = build_lookup(linked.get("instructors.v1", []), "fullName")
    partner_names = build_lookup(linked.get("partners.v1", []), "name")

    # newline="" lets the csv module control line endings; utf-8 is needed
    # because instructor names are not limited to ASCII.
    with open(OUTPUT_PATH, "a", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        # Append mode starts at the end of the file, so position 0 means the
        # file is new or empty and still needs its header.
        if f.tell() == 0:
            writer.writerow(CSV_HEADER)
        for course in data.get("elements", []):
            writer.writerow(course_row(course, instructor_names, partner_names))


if __name__ == "__main__":
    main()
+2

请告诉我们你尝试过什么 – DineshKumar

+0

如果你把用于提取数据的代码也贴出来,会更有帮助。另外请注意,'instructors.v1' 是一个字典数组,其中 '226710' 是值而不是键,键是 'id'。 –

+0

@AnisH_GuptA 是的,正是如此……所以我才另外创建了一个名为 'newdict' 的字典,这样就可以用 '226710' 作为键来取得对应的姓名。 –

回答

1
import requests 
import json 
import re 
from bs4 import BeautifulSoup 

# The full query would also request partnerIds, specializations, s12nlds,
# v1Details, v2Details and description; it is shortened here to the fields
# needed for the instructor mapping.
API_URL = "https://api.coursera.org/api/courses.v1?start=0&limit=20&includes=instructorIds&fields=instructorIds"


def attach_instructor_names(elements, instructors):
    """Replace each element's instructor ids with ``{id: fullName}`` dicts.

    Args:
        elements: list of course dicts; each may carry an ``'instructorIds'``
            list of id strings.
        instructors: list of dicts with ``'id'`` and ``'fullName'`` entries.

    Returns:
        The same ``elements`` list, modified in place. Every element ends up
        with ``'instructorIds'`` set to a list of one-entry dicts, in the
        original id order; ids without a matching instructor are dropped.
    """
    # One exact-match lookup table. A substring test such as
    # `course_id in instructor_id` would wrongly pair '226' with '226710'.
    names_by_id = {inst['id']: inst['fullName'] for inst in instructors}
    for element in elements:
        element['instructorIds'] = [
            {instructor_id: names_by_id[instructor_id]}
            for instructor_id in element.get('instructorIds') or []
            if instructor_id in names_by_id
        ]
    return elements


def main():
    """Fetch the courses, resolve instructor names and dump the result as JSON."""
    response = requests.get(API_URL, timeout=30)
    response.raise_for_status()
    json_data = response.json()

    elements = attach_instructor_names(
        json_data['elements'], json_data['linked']['instructors.v1']
    )
    print(elements)

    # data.json is created in the directory the script is run from; use a full
    # path here to store it somewhere else.
    with open('data.json', 'w', encoding='utf-8') as fp:
        json.dump(json_data, fp, sort_keys=False, indent=4, ensure_ascii=False)


if __name__ == "__main__":
    main()

部分 JSON 输出示例:

"elements": [ 
     { 
      "name": "Gamification", 
      "slug": "gamification", 
      "instructorIds": [ 
       { 
        "226710": "Kevin Werbach" 
       } 
      ], 
      "courseType": "v2.ondemand", 
      "id": "69Bku0KoEeWZtA4u62x6lQ" 
     }, 
     { 
      "name": "Dealing With Missing Data", 
      "slug": "missing-data", 
      "instructorIds": [ 
       { 
        "8394050": "Richard Valliant, Ph.D." 
       } 
      ], 
      "courseType": "v2.ondemand", 
      "id": "0HiU7Oe4EeWTAQ4yevf_oQ" 
     }, 
... 
... 
... 
... 
... 
     { 
      "name": "Accounting Analytics", 
      "slug": "accounting-analytics", 
      "instructorIds": [ 
       { 
        "1937011": "Brian J Bushee" 
       }, 
       { 
        "14757138": "Christopher D. Ittner" 
       } 
      ], 
      "courseType": "v2.ondemand", 
      "id": "rc5KG0aUEeWG1w6arGoEIQ" 
     }, 
     { 
      "name": "Municipal Solid Waste Management in Developing Countries", 
      "slug": "solid-waste-management", 
      "instructorIds": [ 
       { 
        "2387594": "Dr. Christian Zurbrügg" 
       }, 
       { 
        "7293234": "Imanol Zabaleta" 
       }, 
       { 
        "16974677": "Félix Schmidt" 
       } 
      ], 
      "courseType": "v2.ondemand", 
      "id": "gpAI9GK4EeWFkQ7sUCFGVQ" 
     }, 
... 
... 
... 
+0

谢谢。这有帮助! –