# Source code for access_eval.bin.get_sentiment_for_landing_content

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import argparse
import logging
import sys
import time
import traceback
from functools import partial
from pathlib import Path

import numpy as np
import pandas as pd
from selenium import webdriver
from selenium.webdriver import FirefoxOptions
from selenium.webdriver.common.by import By
from textblob import TextBlob
from tqdm import tqdm

###############################################################################

# Configure root logging once at import time so every logger in this process
# shares one format: level, module, line number, and timestamp per record.
logging.basicConfig(
    level=logging.INFO,
    format="[%(levelname)4s: %(module)s:%(lineno)4s %(asctime)s] %(message)s",
)
# Module-level logger used throughout this script.
log = logging.getLogger(__name__)

###############################################################################


class Args(argparse.Namespace):
    """Command-line arguments for the get-sentiment script.

    Parses ``sys.argv`` on construction and stores the parsed values as
    attributes on this namespace (``dataset`` and ``url_column``).
    """

    def __init__(self) -> None:
        self.__parse()

    def __parse(self) -> None:
        # Build the CLI parser and parse directly into this namespace.
        p = argparse.ArgumentParser(
            prog="get-sentiment-for-landing-page-content",
            description=(
                "For each campaign website in the provided dataset, "
                "load the page, retrieve all text content, "
                # Fixed: sentiment is computed with textblob, not spacy.
                "and compute sentiment with textblob."
            ),
        )
        p.add_argument(
            "dataset",
            type=str,
            help="The path to the dataset as a CSV to use.",
        )
        p.add_argument(
            "-c",
            "--url_column",
            type=str,
            default="campaign_website_url",
            help=(
                "The column name in the dataset which contains "
                "the campaign website URLs."
            ),
        )
        p.parse_args(namespace=self)
###############################################################################


def _process_url(row: pd.Series, url_column: str) -> pd.Series:
    """Load the row's URL in headless Firefox, pull all visible page text,
    and attach TextBlob sentiment scores.

    Adds ``polarity`` and ``subjectivity`` fields to the row; on any
    failure both are set to ``numpy.nan`` and processing continues.

    :param row: A dataset row containing the website URL.
    :param url_column: Name of the column holding the URL.
    :return: The same row with the sentiment columns populated.
    """
    # Read the URL before the try-block so the error log below can reference
    # it even when the column lookup itself is what fails (previously `url`
    # could be unbound in the except branch, raising a NameError).
    url = row.get(url_column, None)
    try:
        # Create a new headless Firefox browser for this row
        opts = FirefoxOptions()
        opts.add_argument("--headless")

        # Selenium 4 removed the ``firefox_options`` kwarg; use ``options``
        with webdriver.Firefox(options=opts) as driver:
            log.debug(f"Starting page load for: '{url}'.")
            # Start page load
            driver.get(url)

            # Give dynamic page content a moment to render
            log.debug("Sleeping for 2 seconds.")
            time.sleep(2)

            # Grab every piece of visible text on the page
            log.debug("Getting all page text.")
            page_text = driver.find_element(By.XPATH, "/html/body").text

        # Compute sentiment over the full page text
        log.debug("Processing text to get sentiment.")
        blob = TextBlob(page_text)
        row["polarity"] = blob.polarity
        row["subjectivity"] = blob.subjectivity
    except Exception:
        # Best-effort: record NaNs rather than aborting the whole dataset
        log.error(f"Errored while processing: '{url}'")
        row["polarity"] = np.nan
        row["subjectivity"] = np.nan
    return row


def _process_dataset(dataset: str, url_column: str = "campaign_website_url") -> str:
    """Run sentiment extraction over every row of a CSV dataset.

    Writes the augmented dataset next to the original with a
    ``-with-sentiment`` suffix.

    :param dataset: Path to the CSV dataset to process.
    :param url_column: Column containing the campaign website URLs.
    :return: Path to the newly written ``*-with-sentiment.csv`` file.
    """
    # Bind the URL column name so the row-wise apply only receives the row
    process_func = partial(_process_url, url_column=url_column)

    # Load the dataset
    df = pd.read_csv(dataset)

    # progress_apply == DataFrame.apply with a tqdm progress bar
    tqdm.pandas()
    df = df.progress_apply(process_func, axis=1)

    # Store next to the original file
    original_path = Path(dataset).resolve()
    new_path = original_path.with_name(f"{original_path.stem}-with-sentiment.csv")
    df.to_csv(new_path, index=False)

    # Previously annotated ``-> str`` but returned None; return the path
    return str(new_path)
def main() -> None:
    """Script entry point: parse CLI arguments and process the dataset.

    Any failure is logged with a full traceback and the process exits
    with status code 1.
    """
    try:
        args = Args()
        _process_dataset(args.dataset, args.url_column)
    except Exception as e:
        log.error("=============================================")
        log.error("\n\n" + traceback.format_exc())
        log.error("=============================================")
        log.error("\n\n" + str(e) + "\n")
        log.error("=============================================")
        sys.exit(1)
###############################################################################

# Allow caller to directly run this module (usually in development scenarios)
if __name__ == "__main__":
    main()