top of page

Q&A, Comments, Suggestions

Public·1 member

Regular Expressions (RegEx) - comments

A very insightful post @jasoleramos, thank you for sharing. I agree that RegEx analysis can be an extremely useful tool for identifying patterns within recurring document types in NLP applications. One example is SEC 10-K filings, where an analyst can leverage RegEx patterns to extract key financial data or information contained within 10-K filings in a systematic way, allowing thousands of filings to be analysed at once in a seamless manner. Here's another example of RegEx analysis at work in the 10-K filings example mentioned, with the aim of extracting Items 1A, 7 and 7A from the filings, as they are strategic for sentiment analysis: these sections are traditionally rich in information related to a company's risk factors, management's discussion of financial condition, and qualitative disclosures about market risk:


------------------------

import pandas as pd

import numpy as np

import time

import random

import requests

import re

from bs4 import BeautifulSoup


def send_request(url):
    """Issue an HTTP GET for *url*, posing as a common desktop browser.

    A User-Agent header is picked at random from a small pool, and a
    random 1-3 second pause follows every call to throttle the crawl
    rate (SEC EDGAR expects polite request pacing).

    Returns the `requests.Response` object (status is NOT checked here;
    callers should call `raise_for_status()` themselves).
    """
    browser_agents = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0",
    )
    request_headers = {'User-Agent': random.choice(browser_agents)}
    result = requests.get(url, headers=request_headers)
    time.sleep(random.uniform(1, 3))
    return result


# get filing data
def get_filing_html(cik_info, accession_number, filing_type, filing_date):
    """Download the full-text submission file for one SEC filing.

    Anything other than a 10-K is skipped immediately. On success a dict
    of filing metadata plus the raw document text is returned; on a
    request failure the error is printed and None is returned.

    Args:
        cik_info: dict with at least 'cik'; optionally 'tickers' (list)
            and 'sicDescription'.
        accession_number: EDGAR accession number, dashed form.
        filing_type: e.g. '10-K'; non-10-K values return None.
        filing_date: filing date string, passed through to the result.
    """
    # Only 10-K filings are of interest here.
    if filing_type != '10-K':
        return None

    # EDGAR archive path: dashes are stripped from the accession number
    # in the directory component but kept in the .txt file name.
    url = f"https://www.sec.gov/Archives/edgar/data/{cik_info['cik']}/{accession_number.replace('-', '')}/{accession_number}.txt"

    try:
        response = send_request(url)
        response.raise_for_status()
        tickers = cik_info.get('tickers')
        return {
            'CIK': cik_info['cik'],
            'Ticker': tickers[0] if tickers else 'N/A',  # first listed ticker
            'Accession Number': accession_number,
            'SIC Description': cik_info.get('sicDescription', 'N/A'),
            'Text Document': response.text,  # raw submission text
            'Filing Type': filing_type,
            'Filing Date': filing_date,
        }
    except requests.exceptions.RequestException as e:
        # Best-effort: report and return None rather than crash the crawl.
        print(f"Error: {e}")
        return None


def extract_sentences(raw_10k):
    """Extract the text of Items 1A, 7 and 7A from a raw EDGAR 10-K submission.

    The full-text submission is a multi-document SGML-ish file; each
    sub-document sits between <DOCUMENT>...</DOCUMENT> and declares a
    <TYPE>. Only the '10-K' sub-document is searched for item headers.
    Item boundaries are taken from the LAST occurrence of each header
    (keep='last' after sorting by position) so that table-of-contents
    references are skipped in favour of the actual section headings.

    Args:
        raw_10k: full text of a 10-K .txt submission file.

    Returns:
        A single string concatenating the plain text (HTML stripped via
        BeautifulSoup) of Item 1A, Item 7 and Item 7A, space-separated.
        Missing items contribute an empty string.
    """
    # Regex patterns (raw strings throughout, per re conventions)
    doc_start_pattern = re.compile(r'<DOCUMENT>')
    doc_end_pattern = re.compile(r'</DOCUMENT>')
    type_pattern = re.compile(r'<TYPE>[^\n]+')
    # Matches HTML-style headers (">Item 1A.") and plain-text headers
    # ("ITEM 1A"); 7A must precede 7 so the longer alternative wins.
    item_regex = re.compile(r'(>Item(\s|&#160;|&nbsp;)(1A|1B|7A|7|8)\.{0,1})|(ITEM\s(1A|1B|7A|7|8))')

    # finding document start and end indices
    doc_start_is = [x.end() for x in doc_start_pattern.finditer(raw_10k)]
    doc_end_is = [x.start() for x in doc_end_pattern.finditer(raw_10k)]
    doc_types = [x[len('<TYPE>'):] for x in type_pattern.findall(raw_10k)]

    # keep only the '10-K' sub-document's text (empty string if absent)
    ten_k_text = ''
    for doc_type, doc_start, doc_end in zip(doc_types, doc_start_is, doc_end_is):
        if doc_type == '10-K':
            ten_k_text = raw_10k[doc_start:doc_end]

    # build a position table of every item-header match
    matches = item_regex.finditer(ten_k_text)
    item_df = pd.DataFrame(
        [(x.group(), x.start(), x.end()) for x in matches],
        columns=['item', 'start', 'end'],
    )
    item_df['item'] = item_df['item'].str.lower()

    # normalise header text, e.g. ">Item&#160;1A." -> "item1a"
    # (FIX: r'\.' was previously the non-raw string '\.' — an invalid
    # escape sequence in a normal string literal)
    item_df.replace('&#160;', ' ', regex=True, inplace=True)
    item_df.replace('&nbsp;', ' ', regex=True, inplace=True)
    item_df.replace(' ', '', regex=True, inplace=True)
    item_df.replace(r'\.', '', regex=True, inplace=True)
    item_df.replace('>', '', regex=True, inplace=True)

    # keep the last occurrence of each header (skips table of contents)
    pos_dat = item_df.sort_values('start', ascending=True).drop_duplicates(subset=['item'], keep='last')

    def _section(start_item, end_item):
        """Return 10-K text between *start_item*'s header and *end_item*'s ('' if either is missing)."""
        starts = pos_dat.loc[pos_dat['item'] == start_item, 'start'].values
        ends = pos_dat.loc[pos_dat['item'] == end_item, 'start'].values
        if starts.size > 0 and ends.size > 0:
            return ten_k_text[starts[0]:ends[0]]
        return ''

    # each item runs up to the start of the next item's header
    item_1a_raw = _section('item1a', 'item1b')
    item_7_raw = _section('item7', 'item7a')
    item_7a_raw = _section('item7a', 'item8')

    # strip HTML markup, keeping only the readable text
    item_1a_content = BeautifulSoup(item_1a_raw, 'lxml').get_text()
    item_7_content = BeautifulSoup(item_7_raw, 'lxml').get_text()
    item_7a_content = BeautifulSoup(item_7a_raw, 'lxml').get_text()

    # combining sentences from different items
    all_sentences = f"{item_1a_content} {item_7_content} {item_7a_content}"

    return all_sentences


def main():
    """Example usage: fetch Apple's FY2011 10-K and extract Items 1A/7/7A."""
    cik_info = {
        'cik': '0000320193',  # example CIK (Apple Inc.)
        'tickers': ['AAPL'],  # example ticker
        'sicDescription': 'Electronic Computers [3571]',
    }

    accession_number = '0001193125-11-282113'
    filing_type = '10-K'
    filing_date = '2011-10-26'

    # get filing data (None if the filing type is skipped or the request fails)
    filing_data = get_filing_html(cik_info, accession_number, filing_type, filing_date)

    if filing_data:
        filing_data['Sentences'] = extract_sentences(filing_data['Text Document'])
        print(pd.DataFrame([filing_data])[['CIK', 'Ticker', 'Filing Date', 'SIC Description', 'Sentences']])


# Guard the demo so importing this module does not trigger network calls.
if __name__ == "__main__":
    main()

16 Views
JA Soler
JA Soler
Jan 14

Thank you very much, Max, for participating in the forum and for being a member of CuriousAI.net. I completely agree that RegEx is incredibly useful for extracting key financial data from financial reports like the 10-K. The Python solution you’ve shared for extracting Items 1A, 7, and 7A from the 10-K is very helpful. If you agree, perhaps you and Carlos Soler could review, generalize, and comment on the Python code so that it can be published to be used by any member of our community.

bottom of page