stefanuddenberg
10/2/2019 - 1:06 AM

Pandas -- Get intersection of two DataFrames on a column

# ---
# jupyter:
#   jupytext:
#     text_representation:
#       extension: .py
#       format_name: light
#       format_version: '1.4'
#       jupytext_version: 1.2.4
#   kernelspec:
#     display_name: Python 3
#     language: python
#     name: python3
# ---

# # Testing the correlation between training and test data

import pandas as pd
from random import shuffle

# # Create dummy data

# +
# Dummy training set: ten rows, each pairing a trustworthiness rating with an
# amount. The ratings 1-10 each appear exactly once, in shuffled order.
training_data = pd.DataFrame(
    {
        "trustworthiness": [3, 5, 4, 6, 8, 10, 9, 1, 2, 7],
        "amount": [10, 0, 30, 50, 20, 40, 0, 5, 12, 9],
    }
)

# Dummy test set: six rows. Only the ratings 1, 2 and 7 also occur in the
# training set; 21, 22 and 23 have no counterpart there.
test_data = pd.DataFrame(
    {
        "trustworthiness": [1, 7, 2, 21, 22, 23],
        "amount": [6, 10, 13, 50, 20, 40],
    }
)

# Bare expression at the end of the cell so the notebook renders the frame.
training_data
# -

# # Get overlap of training and test data (sorted)

# +
# Sort both frames by the join key, in place. An inner merge keeps the order
# of the left frame's keys, so the result below is ordered by trustworthiness.
training_data.sort_values(by="trustworthiness", inplace=True)
test_data.sort_values(by="trustworthiness", inplace=True)

# Inner join: keep only the trustworthiness values present in both frames.
# Both frames have an "amount" column, so pandas applies its default suffixes:
# "amount_x" comes from training_data (left), "amount_y" from test_data (right).
intersection = training_data.merge(
    test_data,
    on="trustworthiness",
    how="inner",
)

# Bare expression at the end of the cell so the notebook renders the result.
intersection
# -

# # Get correlation

# Pearson correlation (the pandas default, spelled out here) between the
# training amounts ("amount_x") and the test amounts ("amount_y") over the
# rows whose trustworthiness value appears in both data sets.
intersection.amount_x.corr(other=intersection.amount_y, method="pearson")