📝 Übung zum Aufbereiten von API-Daten#

Zum Lösungsnotebook

In dieser Übung geht es um das Laden von Daten von einer API sowie um die erste strukturierte Aufbereitung dieser Daten für nachfolgende Auswertungsansätze. Wir machen mit dem Beispiel aus Einführung in Web APIs weiter.

Diesmal ist es nicht die Aufgabe, Code zu schreiben, sondern die Herausforderung besteht darin, fremden Code zu lesen.

Aufgabe: API-Daten abfragen, abrufen, aufbereiten#

  • Lesen und verstehen Sie den folgenden Code.

  • Ergänzen Sie die doc-Strings

  • Kommentieren Sie ggf. inline die Code-Zeilen, wenn Sie dies für das Verständnis für nötig halten.

  • Nutzen Sie Markdown-Zellen, um Teile des Codes an den Stellen, wo es nötig erscheint, genauer zu erläutern.

Import#

import requests
import json
import csv
import time
import urllib.parse
import re

Helferfunktionen#

def get_data_from_europeana_search_api(base_url, apikey, params):
    '''
    Send a single request to the Europeana Newspapers Search API and
    return the parsed JSON response.

    Parameters
    ----------
    base_url : str
        API endpoint ending in '?' so the query string can be appended.
    apikey : str
        Europeana API key, sent as the 'wskey' parameter.
    params : dict
        Must contain the keys 'query', 'language', 'profile',
        'hit.selectors', 'sort', 'rows' and 'cursor'; all values are
        appended verbatim, so they must already be URL-safe strings.

    Returns
    -------
    dict
        The decoded JSON body (returned for error responses as well).
    '''
    # Assemble the query string in a fixed order; values are passed
    # through unescaped, so callers must URL-encode them if needed.
    api_search_url = base_url + '&'.join([
        'query=' + params['query'],
        'qf=LANGUAGE:' + params['language'],
        'profile=' + params['profile'],
        'hit.selectors=' + params['hit.selectors'],
        'sort=' + params['sort'],
        'rows=' + params['rows'],
        'cursor=' + params['cursor'],
        'wskey=' + apikey,
    ])

    response = requests.get(api_search_url)
    # NOTE(review): this assumes the API always answers with JSON, even
    # on errors -- a non-JSON error page would raise here. Confirm
    # against the Europeana API behaviour.
    response_content = response.json()

    if response.status_code == 200:
        print('response status ok')
        print(response_content['params'])
    else:
        # BUGFIX: f-string instead of '+' concatenation, so a non-string
        # 'code' value cannot crash the error report with a TypeError.
        print(f"{response_content['error']}\n{response_content['code']}")

    return response_content
def get_results_total(base_url, apikey, params):
    '''
    Page through the complete result set of a Europeana search using
    cursor-based pagination.

    Parameters
    ----------
    base_url, apikey, params
        Passed through to get_data_from_europeana_search_api().
        params['cursor'] is mutated in place with each page's cursor.

    Returns
    -------
    tuple (list, list)
        All 'items' entries and all 'hits' entries across every page.
    '''
    results_items_list = []
    results_hits_list = []

    while True:
        results = get_data_from_europeana_search_api(base_url, apikey, params)

        # BUGFIX: collect this page's data unconditionally. The original
        # only collected when 'nextCursor' was present, silently dropping
        # the items and hits of the final page.
        results_items_list.extend(results.get('items', []))
        results_hits_list.extend(results.get('hits', []))

        if 'nextCursor' not in results:
            break  # last page reached

        # The cursor must be URL-encoded before being sent back.
        params['cursor'] = urllib.parse.quote(results['nextCursor'])
        time.sleep(1)  # throttle requests to be polite to the API

    return results_items_list, results_hits_list
def write_ids_in_text_file(results_list):
    '''
    Append the Europeana record id of every result item to the file
    'europeana_search_result_item_ids.txt', one id per line.

    Parameters
    ----------
    results_list : list of dict
        Search result items; each dict must contain an 'id' key.
    '''
    # Mode 'a' deliberately appends, so ids from repeated runs
    # accumulate instead of overwriting earlier results.
    with open('europeana_search_result_item_ids.txt', 'a') as prozess_file:
        for item in results_list:
            # 'item_id' rather than 'id' to avoid shadowing the builtin.
            item_id = item['id']
            prozess_file.write(item_id + '\n')
def write_data_in_csv_file(results_items_list, results_hits_list):
    '''
    Write the search results to 'newspaper_data.csv' as a
    semicolon-separated table with the columns: newspaper title, id,
    date, data provider, hit.

    Parameters
    ----------
    results_items_list : list of dict
        Result items; each needs 'title' (list), 'id' and
        'dataProvider' (list).
    results_hits_list : list of dict
        Hit records; each needs 'scope' (a record id) and 'selectors',
        a list of dicts with optional 'prefix'/'exact'/'suffix' parts.

    One row is written per hit record matching an item; the 'hit'
    column contains the list of hit sentences collected so far for
    that item.
    '''
    # BUGFIX: newline='' is required by the csv module so that no extra
    # blank lines are inserted between rows on Windows.
    with open('newspaper_data.csv', 'w', encoding='utf-8', newline='') as csv_file:
        header = [
            'newspaper title',
            'id',
            'date',
            'data provider',
            'hit',
        ]
        text_writer = csv.DictWriter(csv_file, delimiter=';', fieldnames=header)
        text_writer.writeheader()

        # Titles look like '<newspaper name> - YYYY-MM-DD'; compile the
        # date pattern once instead of inside the loop.
        date_pattern = re.compile(r'\d\d\d\d-\d\d-\d\d')

        for item in results_items_list:
            title_and_date = item['title'][0]
            date_match = date_pattern.search(title_and_date)
            # BUGFIX: leave the date empty instead of crashing with
            # AttributeError when a title carries no date.
            date = date_match.group() if date_match else ''
            title = re.split(r' - \d\d\d\d-\d\d-\d\d', title_and_date)[0]

            hits = []  # hit sentences accumulated across matching records
            for hit_record in results_hits_list:
                if hit_record['scope'] != item['id']:
                    continue
                for selector in hit_record['selectors']:
                    # Reassemble the highlighted sentence; any of the
                    # three fragments may be absent.
                    hit_sentence = (selector.get('prefix', '')
                                    + selector.get('exact', '')
                                    + selector.get('suffix', ''))
                    hits.append(hit_sentence)

                # One row per matching hit record, as in the original;
                # the hits collected so far are written as a list repr.
                text_writer.writerow({
                    'newspaper title': title,
                    'id': item['id'],
                    'date': date,
                    'data provider': item['dataProvider'][0],
                    'hit': hits,
                })

Anfrage durchführen#

# Endpoint of the Europeana Newspapers Search API; the trailing '?' is
# expected by get_data_from_europeana_search_api when appending params.
europeana_search_api_newspaper_url = 'https://newspapers.eanadev.org/api/v2/search.json?'
# BUGFIX: the original line ("europeana_apikey = # '<YOUR_API_KEY>'")
# was a SyntaxError -- the placeholder must be the assigned value.
europeana_apikey = '<YOUR_API_KEY>'  # TODO: insert your personal API key

# Query parameters: full-text search for 'Python' in German-language
# newspapers, 100 records per page, cursor pagination ('*' = first page).
europeana_params = {'query': 'Python',
                    'language': 'de',
                    'profile': 'hits+params',
                    'hit.selectors': '5',
                    'sort': 'europeana_id+desc',
                    'rows': '100',
                    'cursor': '*'}

# Single test request to verify key and parameters before paging
# through the entire result set.
europeana_search_request_test = get_data_from_europeana_search_api(europeana_search_api_newspaper_url,
                                                                   europeana_apikey,
                                                                   europeana_params)
if 'totalResults' in europeana_search_request_test:
    print(europeana_search_request_test['totalResults'])

# Fetch all pages, then persist the record ids and the tabular data.
europeana_entire_search = get_results_total(europeana_search_api_newspaper_url,
                                            europeana_apikey,
                                            europeana_params)

write_ids_in_text_file(europeana_entire_search[0])
write_data_in_csv_file(europeana_entire_search[0],
                       europeana_entire_search[1])