#!/usr/bin/env python
# -*- coding: utf-8 -*-
from pathlib import Path
from typing import Any, Dict, List, Optional, Union
import altair as alt
import pandas as pd
from .constants import ComputedFields, DatasetFields
from .core import flatten_access_eval_2021_dataset, load_access_eval_2021_dataset
###############################################################################
# Directory the plotting functions in this module write their figures to.
# NOTE: the relative "plots/" is resolved when this module is imported.
PLOTTING_DIR = Path("plots/").resolve()
###############################################################################
def plot_computed_fields_over_vote_share(
    data: Optional[pd.DataFrame] = None,
    save_path: Optional[Union[str, Path]] = None,
) -> Path:
    """
    Plot each computed error field against vote share and save the figure.

    A point chart is repeated column-wise, once per computed field, with the
    vote share field on the x-axis. Points are colored and shaped by the
    ``DatasetFields.contacted`` field.

    Parameters
    ----------
    data: Optional[pd.DataFrame]
        The dataset to plot.
        Default: None (load with ``load_access_eval_2021_dataset``)
    save_path: Optional[Union[str, Path]]
        Where to write the figure.
        Default: None (``PLOTTING_DIR / "vote-share.png"``)

    Returns
    -------
    save_path: Path
        The resolved path the figure was written to.
    """
    if data is None:
        data = load_access_eval_2021_dataset()

    # Fall back to the default file name, then normalise to a resolved Path
    target = (PLOTTING_DIR / "vote-share.png") if save_path is None else save_path
    target = Path(target).resolve()
    target.parent.mkdir(parents=True, exist_ok=True)

    # Fields plotted on the y-axis, one repeated column per field
    repeated_fields = [
        ComputedFields.diff_errors.name,
        ComputedFields.diff_critical_errors.name,
        ComputedFields.diff_serious_errors.name,
        ComputedFields.diff_moderate_errors.name,
        ComputedFields.diff_minor_errors.name,
        ComputedFields.avg_errors_per_page_pre.name,
        ComputedFields.avg_errors_per_page_post.name,
        ComputedFields.avg_critical_errors_per_page_pre.name,
        ComputedFields.avg_critical_errors_per_page_post.name,
        ComputedFields.avg_serious_errors_per_page_pre.name,
        ComputedFields.avg_serious_errors_per_page_post.name,
        ComputedFields.avg_moderate_errors_per_page_pre.name,
        ComputedFields.avg_moderate_errors_per_page_post.name,
        ComputedFields.avg_minor_errors_per_page_pre.name,
        ComputedFields.avg_minor_errors_per_page_post.name,
    ]

    contacted_encoding = f"{DatasetFields.contacted}:N"
    single_panel = (
        alt.Chart(data)
        .mark_point()
        .encode(
            alt.X(f"{DatasetFields.vote_share}:Q"),
            alt.Y(alt.repeat("column"), type="quantitative"),
            color=contacted_encoding,
            shape=contacted_encoding,
        )
    )
    figure = single_panel.repeat(column=repeated_fields)

    figure.save(str(target.resolve()))
    return target
def plot_pre_post_fields_compare(
    data: Optional[pd.DataFrame] = None,
    save_path: Optional[Union[str, Path]] = None,
) -> Path:
    """
    Plot each "pre" average-errors field against its "post" counterpart.

    One point chart is made per (pre, post) field pair, with the post field
    on the x-axis and the pre field on the y-axis. The charts are concatenated
    horizontally, and points are colored and shaped by the
    ``DatasetFields.contacted`` field.

    Parameters
    ----------
    data: Optional[pd.DataFrame]
        The dataset to plot.
        Default: None (load with ``load_access_eval_2021_dataset``)
    save_path: Optional[Union[str, Path]]
        Where to write the figure.
        Default: None (``PLOTTING_DIR / "pre-post.png"``)

    Returns
    -------
    save_path: Path
        The resolved path the figure was written to.
    """
    if data is None:
        data = load_access_eval_2021_dataset()

    # Fall back to the default file name, then normalise to a resolved Path
    target = (PLOTTING_DIR / "pre-post.png") if save_path is None else save_path
    target = Path(target).resolve()
    target.parent.mkdir(parents=True, exist_ok=True)

    # (pre, post) column names, one pair per panel
    field_pairs = [
        (
            ComputedFields.avg_errors_per_page_pre.name,
            ComputedFields.avg_errors_per_page_post.name,
        ),
        (
            ComputedFields.avg_critical_errors_per_page_pre.name,
            ComputedFields.avg_critical_errors_per_page_post.name,
        ),
        (
            ComputedFields.avg_serious_errors_per_page_pre.name,
            ComputedFields.avg_serious_errors_per_page_post.name,
        ),
        (
            ComputedFields.avg_moderate_errors_per_page_pre.name,
            ComputedFields.avg_moderate_errors_per_page_post.name,
        ),
        (
            ComputedFields.avg_minor_errors_per_page_pre.name,
            ComputedFields.avg_minor_errors_per_page_post.name,
        ),
    ]

    contacted_encoding = f"{DatasetFields.contacted}:N"
    figure = alt.hconcat()
    for pre_field, post_field in field_pairs:
        panel = (
            alt.Chart(data)
            .mark_point()
            .encode(
                x=f"{post_field}:Q",
                y=f"{pre_field}:Q",
                color=contacted_encoding,
                shape=contacted_encoding,
            )
        )
        figure |= panel

    figure.save(str(target.resolve()))
    return target
def plot_categorical_against_errors_boxplots(
    data: Optional[pd.DataFrame] = None,
) -> List[Path]:
    """
    Save one figure of error box plots per categorical variable.

    Input data should be the "flattened" dataset.

    For each of the electoral position, candidate position, and election
    result fields, a row of box plots is built (one per error measure, split
    into columns by the categorical variable) and written to
    ``PLOTTING_DIR / "{variable}-errors-split.png"``.

    Parameters
    ----------
    data: Optional[pd.DataFrame]
        The flattened dataset.
        Default: None (load with ``flatten_access_eval_2021_dataset``)

    Returns
    -------
    save_paths: List[Path]
        The paths of the written figures, in the order they were produced.
    """
    if data is None:
        data = flatten_access_eval_2021_dataset()

    # Only the post data is used for summary stats as there was no difference
    # pre and post (trial / contact)
    post_only = data[data[DatasetFields.trial] == "B - Post"]

    # Error measures to plot, with the "_post" suffix stripped from each name
    error_features = [
        field_name.replace("_post", "")
        for field_name in (
            ComputedFields.avg_errors_per_page_post.name,
            ComputedFields.avg_minor_errors_per_page_post.name,
            ComputedFields.avg_moderate_errors_per_page_post.name,
            ComputedFields.avg_serious_errors_per_page_post.name,
            ComputedFields.avg_critical_errors_per_page_post.name,
        )
    ]

    # Every panel uses the range of the overall errors-per-page column as its
    # y-domain so the box plots are directly comparable
    overall_feature = ComputedFields.avg_errors_per_page_post.name.replace(
        "_post", ""
    )
    shared_domain = (
        post_only[overall_feature].min(),
        post_only[overall_feature].max(),
    )

    written = []
    for cat_var in (
        DatasetFields.electoral_position,
        DatasetFields.candidate_position,
        DatasetFields.election_result,
    ):
        # One horizontal row: all errors followed by each error severity
        row = alt.hconcat()
        for feature in error_features:
            y_axis = alt.Y(
                f"{feature}:Q",
                scale=alt.Scale(domain=shared_domain, padding=1),
            )
            facet = alt.Column(
                f"{cat_var}:N", spacing=40, header=alt.Header(orient="bottom")
            )
            row |= (
                alt.Chart(post_only)
                .mark_boxplot(ticks=True)
                .encode(y=y_axis, column=facet)
            )

        out_path = PLOTTING_DIR / f"{cat_var}-errors-split.png"
        out_path.parent.mkdir(parents=True, exist_ok=True)
        row.save(str(out_path))
        written.append(out_path)

    return written
def plot_locations_against_errors_boxplots(
    data: Optional[pd.DataFrame] = None,
) -> Path:
    """
    Save a grid of error box plots with one row per location.

    Input data should be the "flattened" dataset.

    Locations with fewer than two rows are removed first. Each remaining
    location with more than four rows gets a row of box plots (one per error
    measure, split into columns by candidate position). All panels share the
    y-domain of the overall errors-per-page column.

    Parameters
    ----------
    data: Optional[pd.DataFrame]
        The flattened dataset.
        Default: None (load with ``flatten_access_eval_2021_dataset``)

    Returns
    -------
    save_path: Path
        The path the figure was written to
        (``PLOTTING_DIR / "location-errors-split.png"``).
    """
    if data is None:
        data = flatten_access_eval_2021_dataset()

    # Only the post data is used for summary stats as there was no difference
    # pre and post (trial / contact)
    data = data[data[DatasetFields.trial] == "B - Post"]

    # Drop any locations with fewer than two campaigns.
    # `.index` is required: value_counts() is indexed by location name, and
    # passing the Series itself to isin() would compare names against counts.
    location_counts = data[DatasetFields.location].value_counts()
    sparse_locations = location_counts[location_counts < 2].index
    data = data[~data[DatasetFields.location].isin(sparse_locations)]

    # Error measures to plot, with the "_post" suffix stripped from each name
    error_features = [
        field_name.replace("_post", "")
        for field_name in (
            ComputedFields.avg_errors_per_page_post.name,
            ComputedFields.avg_minor_errors_per_page_post.name,
            ComputedFields.avg_moderate_errors_per_page_post.name,
            ComputedFields.avg_serious_errors_per_page_post.name,
            ComputedFields.avg_critical_errors_per_page_post.name,
        )
    ]

    # Shared y-domain taken from the overall errors-per-page column across all
    # remaining locations, so rows can be compared against each other
    overall_feature = ComputedFields.avg_errors_per_page_post.name.replace(
        "_post", ""
    )
    shared_domain = (data[overall_feature].min(), data[overall_feature].max())

    location_plots = alt.vconcat()
    for location in data[DatasetFields.location].unique():
        location_subset = data.loc[data[DatasetFields.location] == location]

        # Only locations with more than four rows are plotted
        if len(location_subset) <= 4:
            continue

        error_types = alt.hconcat()
        for feature in error_features:
            error_types |= (
                alt.Chart(location_subset)
                .mark_boxplot(ticks=True)
                .encode(
                    y=alt.Y(
                        f"{feature}:Q",
                        scale=alt.Scale(domain=shared_domain, padding=1),
                    ),
                    column=alt.Column(
                        f"{DatasetFields.candidate_position}:N",
                        spacing=60,
                        header=alt.Header(orient="bottom"),
                    ),
                )
            )
        location_plots &= error_types

    save_path = PLOTTING_DIR / "location-errors-split.png"
    save_path.parent.mkdir(parents=True, exist_ok=True)
    location_plots.save(str(save_path))
    return save_path
def plot_error_types_boxplots(
    data: Optional[pd.DataFrame] = None,
) -> Path:
    """
    Save a grid of box plots: one row per error type, one group per category.

    Input data should be the "flattened" dataset.

    Every column whose name contains "avg_error-type_" becomes a row. Within a
    row, the error type is plotted against the electoral position, candidate
    position, and election result fields, using that error type's own
    min / max as the y-domain.

    Parameters
    ----------
    data: Optional[pd.DataFrame]
        The flattened dataset.
        Default: None (load with ``flatten_access_eval_2021_dataset``)

    Returns
    -------
    save_path: Path
        The path the figure was written to
        (``PLOTTING_DIR / "error-types-by-category-splits.png"``).
    """
    if data is None:
        data = flatten_access_eval_2021_dataset()

    # Only the post data is used for summary stats as there was no difference
    # pre and post (trial / contact)
    post_only = data[data[DatasetFields.trial] == "B - Post"]

    # All pre-computed avg error type features
    error_type_columns = [
        name for name in post_only.columns if "avg_error-type_" in name
    ]
    categories = (
        DatasetFields.electoral_position,
        DatasetFields.candidate_position,
        DatasetFields.election_result,
    )

    grid = alt.vconcat()
    for err_type in error_type_columns:
        # The y-domain is specific to this error type and shared by its row
        row_domain = (post_only[err_type].min(), post_only[err_type].max())

        row = alt.hconcat()
        for cat_var in categories:
            y_axis = alt.Y(
                f"{err_type}:Q",
                scale=alt.Scale(domain=row_domain, padding=1),
            )
            facet = alt.Column(
                f"{cat_var}:N", spacing=60, header=alt.Header(orient="bottom")
            )
            row |= (
                alt.Chart(post_only)
                .mark_boxplot(ticks=True)
                .encode(y=y_axis, column=facet)
            )
        grid &= row

    out_path = PLOTTING_DIR / "error-types-by-category-splits.png"
    out_path.parent.mkdir(parents=True, exist_ok=True)
    grid.save(str(out_path))
    return out_path
def _plot_and_fig_text(
    data: pd.DataFrame,
    plot_cols: List[str],
    fig_text_prefix: str,
    subset_name: str,
    column: Optional[alt.Column] = None,
    consistent_scale: bool = False,
    title: Optional[str] = None,
) -> None:
    """
    Save a row of box plots plus a text file of summary statistics.

    One box plot is made per entry of ``plot_cols`` and the plots are
    concatenated horizontally. The figure is written to
    ``PLOTTING_DIR / "{subset_name}.png"`` and the caption text to the same
    path with a ".txt" suffix.

    Parameters
    ----------
    data: pd.DataFrame
        The data to plot. Must contain every column in ``plot_cols``.
    plot_cols: List[str]
        The columns to draw a box plot for and to summarise in the text.
    fig_text_prefix: str
        Text placed before the per-column summaries in the text file.
    subset_name: str
        File name stem used for both the figure and the text file.
    column: Optional[alt.Column]
        An optional column encoding applied to every box plot.
        Default: None (no column encoding)
    consistent_scale: bool
        When True, every box plot uses one y-domain spanning the min and max
        over all ``plot_cols``.
        Default: False (each plot gets a default scale)
    title: Optional[str]
        A title for the concatenated chart.
        Default: None (no title)

    Notes
    -----
    A previous version called ``chart.properties(title=...)`` with a fixed
    title and discarded the returned chart, so no title was ever rendered.
    The title is now opt-in so existing figures are unchanged.
    """
    if consistent_scale:
        scale = alt.Scale(
            domain=(
                min(data[col].min() for col in plot_cols),
                max(data[col].max() for col in plot_cols),
            ),
            padding=1,
        )
    else:
        scale = alt.Scale()

    # The column encoding is only passed along when one was provided
    extra_encodings = {} if column is None else {"column": column}

    chart = alt.hconcat(spacing=40)
    summaries = []
    for col in plot_cols:
        chart |= (
            alt.Chart(data)
            .mark_boxplot()
            .encode(y=alt.Y(col, scale=scale), **extra_encodings)
        )

        values = data[col]
        summaries.append(
            f" {col} "
            f"mean: {round(values.mean(), 2)}, "
            f"std: {round(values.std(), 2)}, "
            f"min: {round(values.min(), 2)}, "
            f"max: {round(values.max(), 2)}."
        )

    # properties() returns a new chart rather than mutating in place, so the
    # result has to be kept for the title to appear in the saved figure
    if title is not None:
        chart = chart.properties(title=title)

    # Save fig and text
    fig_save_path = PLOTTING_DIR / f"{subset_name}.png"
    fig_save_path.parent.mkdir(parents=True, exist_ok=True)
    chart.save(str(fig_save_path))
    fig_save_path.with_suffix(".txt").write_text(
        fig_text_prefix + "".join(summaries),
        encoding="utf-8",
    )
def plot_summary_stats(
    data: Optional[pd.DataFrame] = None,
    subset_name: str = "",
    keep_cols: Optional[List[str]] = None,
    plot_kwargs: Optional[Dict[str, Any]] = None,
) -> None:
    """
    Save box plots and summary text for content, error severity, and error types.

    Input data should be the "flattened" dataset.

    Three figure / text pairs are written through ``_plot_and_fig_text``,
    named ``{subset_name}content-stats``, ``{subset_name}error-severity`` and
    ``{subset_name}error-types``. The two error figures use a consistent
    y-scale across their columns.

    Parameters
    ----------
    data: Optional[pd.DataFrame]
        The flattened dataset.
        Default: None (load with ``flatten_access_eval_2021_dataset``)
    subset_name: str
        Prefix for the output file names.
        Default: "" (no prefix)
    keep_cols: Optional[List[str]]
        Extra columns to keep in the frames handed to the plotting helper,
        in addition to the plotted columns (for example a column referenced
        by an encoding in ``plot_kwargs``).
        Default: None (keep no extra columns)
    plot_kwargs: Optional[Dict[str, Any]]
        Extra keyword arguments forwarded to ``_plot_and_fig_text``.
        Default: None (no extra arguments)
    """
    # None sentinels instead of mutable default arguments
    keep_cols = [] if keep_cols is None else keep_cols
    plot_kwargs = {} if plot_kwargs is None else plot_kwargs

    if data is None:
        data = flatten_access_eval_2021_dataset()

    # Only the post data is used for summary stats as there was no difference
    # pre and post (trial / contact)
    data = data[data[DatasetFields.trial] == "B - Post"]

    # Split into different commonly grouped stats
    # Content is the actual website content
    content_cols = [
        DatasetFields.number_of_pages_post.replace("_post", ""),
        DatasetFields.ease_of_reading,
        DatasetFields.number_of_words,
        DatasetFields.number_of_unique_words,
    ]

    # Error count norm stats
    error_counts_normed_cols = [
        field_name.replace("_post", "")
        for field_name in (
            ComputedFields.avg_errors_per_page_post.name,
            ComputedFields.avg_minor_errors_per_page_post.name,
            ComputedFields.avg_moderate_errors_per_page_post.name,
            ComputedFields.avg_serious_errors_per_page_post.name,
            ComputedFields.avg_critical_errors_per_page_post.name,
        )
    ]

    # Error types are the actual error value (what was the error)
    error_types_cols = [c for c in data.columns if "avg_error-type_" in c]

    # Create content plots
    _plot_and_fig_text(
        data=data[[*content_cols, *keep_cols]],
        plot_cols=content_cols,
        fig_text_prefix=(
            "Distributions for key content statistics "
            "gathered while scraping campaign websites."
        ),
        subset_name=f"{subset_name}content-stats",
        **plot_kwargs,
    )

    # Create norm stats plots
    _plot_and_fig_text(
        data=data[[*error_counts_normed_cols, *keep_cols]],
        plot_cols=error_counts_normed_cols,
        fig_text_prefix=(
            "Distributions for normalized error severity counts "
            "(counts for each error severity / number of pages) "
            "statistics gathered from scraping campaign websites."
        ),
        subset_name=f"{subset_name}error-severity",
        consistent_scale=True,
        **plot_kwargs,
    )

    # Create error types plots
    _plot_and_fig_text(
        data=data[[*error_types_cols, *keep_cols]],
        plot_cols=error_types_cols,
        fig_text_prefix=(
            "Distributions for normalized error types counts "
            "(counts for each error type / number of pages) "
            "statistics gathered from scraping campaign websites."
        ),
        subset_name=f"{subset_name}error-types",
        consistent_scale=True,
        **plot_kwargs,
    )
def plot_location_based_summary_stats(
    data: Optional[pd.DataFrame] = None,
) -> None:
    """
    Save the summary stats figures split into columns by location.

    Input data should be the "flattened" dataset.

    Locations with two or fewer rows in the post data are excluded before the
    remaining rows are handed to ``plot_summary_stats`` with the
    "location-split-" file name prefix.

    Parameters
    ----------
    data: Optional[pd.DataFrame]
        The flattened dataset.
        Default: None (load with ``flatten_access_eval_2021_dataset``)
    """
    if data is None:
        data = flatten_access_eval_2021_dataset()

    # Only the post data is used for summary stats as there was no difference
    # pre and post (trial / contact)
    post_only = data[data[DatasetFields.trial] == "B - Post"]

    # Exclude locations with two or fewer campaigns.
    # NOTE(review): the earlier comment here said "less than two" while the
    # comparison is `<= 2` -- confirm which threshold is intended.
    campaigns_per_location = post_only[DatasetFields.location].value_counts()
    excluded_locations = campaigns_per_location[campaigns_per_location <= 2].index
    is_kept = ~post_only[DatasetFields.location].isin(excluded_locations)

    # Plot basic stats
    plot_summary_stats(
        post_only[is_kept],
        subset_name="location-split-",
        keep_cols=[DatasetFields.location],
        plot_kwargs={"column": alt.Column(DatasetFields.location, spacing=60)},
    )
def plot_election_result_based_summary_stats(
    data: Optional[pd.DataFrame] = None,
) -> None:
    """
    Save the summary stats figures split into columns by election result.

    Input data should be the "flattened" dataset.

    The post data is handed to ``plot_summary_stats`` with the
    "election-result-split-" file name prefix.

    Parameters
    ----------
    data: Optional[pd.DataFrame]
        The flattened dataset.
        Default: None (load with ``flatten_access_eval_2021_dataset``)
    """
    if data is None:
        data = flatten_access_eval_2021_dataset()

    # Only the post data is used for summary stats as there was no difference
    # pre and post (trial / contact)
    post_only = data[data[DatasetFields.trial] == "B - Post"]

    # Plot basic stats
    split_field = DatasetFields.election_result
    plot_summary_stats(
        post_only,
        subset_name="election-result-split-",
        keep_cols=[split_field],
        plot_kwargs={"column": alt.Column(split_field, spacing=40)},
    )
def plot_electoral_position_based_summary_stats(
    data: Optional[pd.DataFrame] = None,
) -> None:
    """
    Save the summary stats figures split into columns by electoral position.

    Input data should be the "flattened" dataset.

    The post data is handed to ``plot_summary_stats`` with the
    "election-position-split-" file name prefix.

    Parameters
    ----------
    data: Optional[pd.DataFrame]
        The flattened dataset.
        Default: None (load with ``flatten_access_eval_2021_dataset``)
    """
    if data is None:
        data = flatten_access_eval_2021_dataset()

    # Only the post data is used for summary stats as there was no difference
    # pre and post (trial / contact)
    post_only = data[data[DatasetFields.trial] == "B - Post"]

    # Plot basic stats
    split_field = DatasetFields.electoral_position
    plot_summary_stats(
        post_only,
        subset_name="election-position-split-",
        keep_cols=[split_field],
        plot_kwargs={"column": alt.Column(split_field, spacing=40)},
    )
def plot_candidate_position_based_summary_stats(
    data: Optional[pd.DataFrame] = None,
) -> None:
    """
    Save the summary stats figures split into columns by candidate position.

    Input data should be the "flattened" dataset.

    The post data is handed to ``plot_summary_stats`` with the
    "candidate-position-split-" file name prefix.

    Parameters
    ----------
    data: Optional[pd.DataFrame]
        The flattened dataset.
        Default: None (load with ``flatten_access_eval_2021_dataset``)
    """
    if data is None:
        data = flatten_access_eval_2021_dataset()

    # Only the post data is used for summary stats as there was no difference
    # pre and post (trial / contact)
    post_only = data[data[DatasetFields.trial] == "B - Post"]

    # Plot basic stats
    split_field = DatasetFields.candidate_position
    plot_summary_stats(
        post_only,
        subset_name="candidate-position-split-",
        keep_cols=[split_field],
        plot_kwargs={"column": alt.Column(split_field, spacing=40)},
    )
def plot_pre_post_errors(
    data: Optional[pd.DataFrame] = None,
) -> None:
    """
    Save box plots of errors per page, split by trial and by contacted.

    Input data should be the "flattened" dataset.

    The errors-per-page column is plotted on the y-axis against the
    ``DatasetFields.contacted`` field (also used for color), with one column
    of plots per value of ``DatasetFields.trial``. The figure is written to
    ``PLOTTING_DIR / "pre-post-errors.png"``.

    Parameters
    ----------
    data: Optional[pd.DataFrame]
        The flattened dataset.
        Default: None (load with ``flatten_access_eval_2021_dataset``)
    """
    if data is None:
        data = flatten_access_eval_2021_dataset()

    # Column name of the overall errors-per-page measure ("_post" stripped)
    errors_per_page = ComputedFields.avg_errors_per_page_post.name.replace(
        "_post", ""
    )

    # Make pre post chart with split by contacted
    trial_facet = alt.Column(DatasetFields.trial, spacing=30)
    chart = (
        alt.Chart(data)
        .mark_boxplot()
        .encode(
            x=DatasetFields.contacted,
            y=f"{errors_per_page}:Q",
            column=trial_facet,
            color=DatasetFields.contacted,
        )
    )

    # Save
    PLOTTING_DIR.mkdir(parents=True, exist_ok=True)
    chart.save(str(PLOTTING_DIR / "pre-post-errors.png"))