Importer/exporter les données

Bases


Paramètres courants


CSV

Documentation: pandas.read_csv


TXT

Pour charger un TXT qui n’est structuré correctement — par exemple où les colonnes ne sont pas toutes ordonnées de la même manière d’une ligne à l’autre — alors il faut charger le fichier manuellement et structurer les données avant de charger le DataFrame.

import csv

with open('certificates.csv', 'r') as fp:
    reader = csv.reader(fp, delimiter=',')
    next(reader)
    for index, values in enumerate(reader):
        name, certs_num, months_num = values
        print(f"{name} earned {certs_num} certificates in {months_num} months")

Excel

Documentation: pandas.read_excel


JSON

df = pd.read_json(filepath_or_url)

Pour récupérer un JSON distant qui nécessite des headers/cookies personnalisés ou qui n’est au bon format (ex dans une sous-clé), utiliser requests

import requests

def get_historic_price(
  symbol,
  exchange='bitfinex',
  after='2018-09-01'
):
    url = 'https://api.cryptowat.ch/markets/{exchange}/{symbol}usd/ohlc'.format(
      symbol=symbol,
      exchange=exchange
    )

    # Get JSON
    resp = requests.get(url, params={
        'periods': '3600',
        'after': str(int(pd.Timestamp(after).timestamp()))
    })
    resp.raise_for_status()
    data = resp.json()

    # Create DataFrame from JSON
    df = pd.DataFrame(
        data=data['result']['3600'],
        columns=[
          'CloseTime',
          'OpenPrice',
          'HighPrice',
          'LowPrice',
          'ClosePrice',
          'Volume',
          'NA'
    ])
    df['CloseTime'] = pd.to_datetime(df['CloseTime'], unit='s')
    df.set_index('CloseTime', inplace=True)

    return df

last_week = (pd.Timestamp.now() - pd.offsets.Day(7))

df_btc = get_historic_price('btc', 'bitstamp', after=last_week)
bf_eth = get_historic_price('eth', 'bitstamp', after=last_week)

Table HTML

Documentation: pandas.read_html


SQLite3

import sqlite3

_file = '../input/pitchfork-data/database.sqlite'
query = 'SELECT * FROM artists'

conn  = sqlite3.connect(_file)
df    = pd.read_sql_query(query, conn)
conn.close()
from sqlalchemy import create_engine

engine = create_engine('sqlite:///chinook.db')
conn   = engine.connect()
df     = pd.read_sql_table('employees', con=conn)

SQL

from sqlalchemy import create_engine

engine = create_engine("postgresql+pg8000://YOUR_USERNAME:YOUR_PASSWORD@localhost/YOUR_DATABASE")
conn   = engine.connect()

df = pd.read_sql("SELECT * FROM stocks.stock_dim", conn)
df.to_sql('housing_table', con=connection, schema='public', if_exists='replace')

Parquet

df = pd.read_parquet('../input/feature-engineering-data/baseline_data.pqt')

Matlab

from scipy.io import loadmat

mat = loadmat('cardio.mat')
df  = pd.DataFrame(np.hstack((mat['X'], mat['y'])))

Pickle

df.to_pickle('corpus.pkl')
df = pd.read_pickle('corpus.pkl')