📝 Exercise: Preparing API Data#

This exercise covers fetching data from an API and giving that data a first structured preparation for subsequent analysis. We continue with the example from Introduction to Web APIs.

This time the task is not to write code; the challenge is to read someone else's code.

Task: Query, Fetch, and Prepare API Data#

  • Read and understand the following code.

  • Complete the docstrings.

  • Add inline comments to individual code lines wherever you consider them necessary for understanding.

  • Use Markdown cells to explain parts of the code in more detail where this seems necessary.

Imports#

import requests      # HTTP requests to the Europeana Search API
import csv           # writing the results to a CSV file
import time          # pausing between paginated requests
import urllib.parse  # URL-escaping the pagination cursor
import re            # extracting title and date via regular expressions

Helper Functions#

def get_data_from_europeana_search_api(base_url, apikey, params):
    '''
    Fetches data from the Europeana Search API using specified parameters.

    Args:
        base_url (str): The base URL for the Europeana Search API.
        apikey (str): The API key to authenticate the request.
        params (dict): A dictionary containing the following keys:
            - 'query' (str): The search query string.
            - 'language' (str): The language filter for the search results.
            - 'profile' (str): The profile specifying the level of detail in the search results.
            - 'hit.selectors' (str): The fields to include in the search results.
            - 'sort' (str): The sorting order for the search results.
            - 'rows' (str): The number of results to return.
            - 'cursor' (str): The cursor for pagination.

    Returns:
        dict: The JSON response content from the Europeana Search API.

    Raises:
        Exception: If the API request fails, prints the error message and code returned by the Europeana API.
    
    '''

    api_search_url = base_url + \
                      'query=' + params['query'] + \
                      '&qf=LANGUAGE:' + params['language'] + \
                      '&profile=' + params['profile'] + \
                      '&hit.selectors=' + params['hit.selectors'] + \
                      '&sort=' + params['sort'] + \
                      '&rows=' + params['rows'] + \
                      '&cursor=' + params['cursor'] + \
                      '&wskey=' + apikey
    
    response = requests.get(api_search_url)
    response_content = response.json()

    if response.status_code == 200:
        print('response status ok')
        print(response_content['params'])  # show the parameters the API actually applied to the request
    else:
        print(response_content['error'] + '\n' + response_content['code'])  # on failure, show the error message and code returned by Europeana

    return response_content
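
A side note on the design: the function builds the query string by hand, which keeps every parameter visible but leaves all escaping to the caller. A common alternative is to pass the parameters as a dict and let requests assemble and URL-encode the query string itself. The following is only a sketch of that idea, not part of the exercise; the helper name is invented, and note that sort and cursor would then be passed unencoded, since requests escapes them on its own:

def get_data_via_requests_params(base_url, apikey, params):
    # Hypothetical alternative: hand the parameters to requests as a dict
    # and let it build and URL-encode the query string.
    query_params = {
        'query': params['query'],
        'qf': 'LANGUAGE:' + params['language'],
        'profile': params['profile'],
        'hit.selectors': params['hit.selectors'],
        'sort': params['sort'].replace('+', ' '),  # '+' in the hand-built URL encodes a space
        'rows': params['rows'],
        'cursor': params['cursor'],  # pass the raw cursor; requests escapes it itself
        'wskey': apikey,
    }
    response = requests.get(base_url.rstrip('?'), params=query_params)
    return response.json()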
def get_results_total(base_url, apikey, params):
    '''
    Retrieves the total set of results from the Europeana Search API by iterating through paginated results.

    Args:
        base_url (str): The base URL for the Europeana Search API.
        apikey (str): The API key to authenticate the request.
        params (dict): A dictionary containing the parameters for the API request, including:
            - 'query' (str): The search query string.
            - 'language' (str): The language filter for the search results.
            - 'profile' (str): The profile specifying the level of detail in the search results.
            - 'hit.selectors' (str): The fields to include in the search results.
            - 'sort' (str): The sorting order for the search results.
            - 'rows' (str): The number of results to return.
            - 'cursor' (str): The cursor for pagination.

    Returns:
        tuple: A tuple containing two lists:
            - results_items_list: A list of item subtrees from the API responses.
            - results_hits_list: A list of hit subtrees from the API responses.

    Raises:
        Exception: If an error occurs during the API request, the function from `get_data_from_europeana_search_api` will handle it.
    
    '''

    cursor = True
    results_items_list = []  # collect the items subtree in a separate list
    results_hits_list = []   # collect the hits subtree in a separate list

    while cursor:
        results = get_data_from_europeana_search_api(base_url, apikey, params)
        results_items_list.extend(results.get('items', []))  # extend before checking for a next cursor, so a final page without nextCursor is not lost
        results_hits_list.extend(results.get('hits', []))
        if 'nextCursor' in results:
            params['cursor'] = urllib.parse.quote(results['nextCursor'])  # URL-escape nextCursor before reuse (see the API docs)
            time.sleep(1)  # pause briefly between paginated requests
        else:
            cursor = False

    return results_items_list, results_hits_list
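
The loop relies on one simple contract: as long as a response contains nextCursor, another page follows; once the field is missing, the last page has been reached. Here is a minimal, self-contained sketch of that termination condition with invented stand-in pages (only the field names items, hits, and nextCursor come from the real responses):

fake_pages = [
    {'items': [{'id': '/page1/item'}], 'hits': [], 'nextCursor': 'AoE'},  # follow-up cursor present: more pages exist
    {'items': [{'id': '/page2/item'}], 'hits': []},                       # no 'nextCursor': this is the last page
]

collected_items = []
for page in fake_pages:
    collected_items.extend(page['items'])
    if 'nextCursor' not in page:
        break  # pagination ends when the API omits 'nextCursor'

print(collected_items)  # both items, including those from the final page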
def write_ids_in_text_file(results_list):
    '''
    Writes the IDs of digital objects from the results list into a text file.

    Args:
        results_list (list): A list of result dictionaries, where each dictionary contains an 'id' key representing the ID of a digital object.

    Returns:
        None

    Side Effects:
        Appends the IDs to a text file named 'europeana_search_result_item_ids.txt', each ID on a new line.
    
    '''

    with open('europeana_search_result_item_ids.txt', 'a') as prozess_file:
        for result in results_list:
            object_id = result['id']  # node holding the id of the digital object
            prozess_file.write(object_id + '\n')
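
Note that the file is opened in append mode ('a'), so repeated runs keep accumulating ids. For later steps, for example follow-up requests for individual records, the ids can be read back in with a few lines (a minimal sketch, assuming the file was written by the function above):

with open('europeana_search_result_item_ids.txt', 'r') as id_file:
    item_ids = [line.strip() for line in id_file]

print(len(item_ids), 'ids read back from the file')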
def write_data_in_csv_file(results_items_list, results_hits_list):
    '''
    Writes detailed data from the results lists into a CSV file.

    Args:
        results_items_list (list): A list of dictionaries containing information about newspaper items.
        results_hits_list (list): A list of dictionaries containing hit information related to the search query.

    Returns:
        None

    Side Effects:
        Creates and writes to a CSV file named 'newspaper_data.csv', storing information such as newspaper title, ID, date, data provider, and search hits.
    
    '''

    with open('newspaper_data.csv', 'w', encoding='utf-8', newline='') as csv_file:
        header = [
            'newspaper title',
            'id',
            'date',
            'data provider',
            'hit'
            ]
        text_writer = csv.DictWriter(csv_file, delimiter=';', fieldnames=header)
        text_writer.writeheader()

        for item in results_items_list:  # the items subtree holds the title, date, and data provider
            title_and_date = item['title'][0]  # title and date are stored in a single node and must be split for separate storage
            date_match = re.search(r'\d{4}-\d{2}-\d{2}', title_and_date)  # the date always follows the pattern yyyy-mm-dd
            date = date_match.group() if date_match else ''  # guard against a title without a date
            title = re.split(r' - \d{4}-\d{2}-\d{2}', title_and_date)[0]  # extract the title, which always precedes the date
            hits = []  # collect all hits for this object in a list; stays empty if there are none

            for hit in results_hits_list:  # the hits subtree holds the search term in context (as a full sentence)
                if hit['scope'] == item['id']:  # match a hit to its title, date, and data provider by comparing ids
                    for selector in hit['selectors']:
                        hit_sentence = ''  # concatenate the separate prefix, exact, and suffix parts
                        if 'prefix' in selector:  # a part can be missing (usually the prefix), so check before accessing
                            hit_sentence += selector['prefix']
                        if 'exact' in selector:
                            hit_sentence += selector['exact']
                        if 'suffix' in selector:
                            hit_sentence += selector['suffix']
                        hits.append(hit_sentence)

            if hits:  # write one row per item, after all of its hits have been collected
                new_row = {
                    'newspaper title': title,
                    'id': item['id'],
                    'date': date,
                    'data provider': item['dataProvider'][0],
                    'hit': hits
                    }
                text_writer.writerow(new_row)
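
One detail worth knowing for the subsequent analysis: the csv module converts non-string values with str(), so the hit column ends up containing the Python string representation of a list. A hedged sketch of how the file could be read back, turning that column into a real list again with ast.literal_eval:

import ast
import csv

with open('newspaper_data.csv', 'r', encoding='utf-8', newline='') as csv_file:
    for row in csv.DictReader(csv_file, delimiter=';'):
        hit_sentences = ast.literal_eval(row['hit'])  # turn the stored list repr back into a list
        print(row['newspaper title'], row['date'], len(hit_sentences))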

Running the Query#

europeana_search_api_newspaper_url = 'https://newspapers.eanadev.org/api/v2/search.json?'
europeana_apikey = '<YOUR_API_KEY>'  # insert your personal Europeana API key here

europeana_params = {'query':'Python', 
                    'language':'de', 
                    'profile':'hits+params', 
                    'hit.selectors':'5', 
                    'sort':'europeana_id+desc', 
                    'rows':'100', 
                    'cursor':'*'}
europeana_search_request_test = get_data_from_europeana_search_api(europeana_search_api_newspaper_url, 
                                                                   europeana_apikey, 
                                                                   europeana_params) # test query to find out how many records we are dealing with
if 'totalResults' in europeana_search_request_test:
    print(europeana_search_request_test['totalResults'])
    
europeana_entire_search = get_results_total(europeana_search_api_newspaper_url, 
                                            europeana_apikey, 
                                            europeana_params)

write_ids_in_text_file(europeana_entire_search[0])
write_data_in_csv_file(europeana_entire_search[0], 
                       europeana_entire_search[1])
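
As a final sanity check (a sketch, assuming the requests above succeeded), the number of collected items can be compared with the totalResults reported by the test query:

items, hits = europeana_entire_search
print(len(items), 'items and', len(hits), 'hit records collected')
if 'totalResults' in europeana_search_request_test:
    print('totalResults reported by the API:', europeana_search_request_test['totalResults'])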