!pip install sec-api

# Quick start: extract individual item sections from 10-K, 10-Q and 8-K
# filings with the sec-api Extractor API.
from sec_api import ExtractorApi

# Replace the placeholder with your own API key.
extractorApi = ExtractorApi("YOUR_API_KEY")

# 10-K example
url_10k = "https://www.sec.gov/Archives/edgar/data/1318605/000156459021004599/tsla-10k_20201231.htm"
# extract Item 1.A Risk Factors from 10-K filing in text format
item_1A_text = extractorApi.get_section(url_10k, "1A", "text")
# extract Item 7 "MD&A" from 10-K filing in html format
item_7_html = extractorApi.get_section(url_10k, "7", "html")

# 10-Q example
url_10q = "https://www.sec.gov/Archives/edgar/data/1318605/000095017022006034/tsla-20220331.htm"
# extract Part II Item 1A Risk Factors from 10-Q filing in text format
part2_item_1A_text = extractorApi.get_section(url_10q, "part2item1a", "text")
# extract Part I Item 2 "MD&A" from 10-Q filing in html format
# (a 10-Q has no Part II Item 7; "part2item7" is not a valid 10-Q item ID)
part1_item_2_html = extractorApi.get_section(url_10q, "part1item2", "html")

# 8-K example
url_8k = "https://www.sec.gov/Archives/edgar/data/66600/000149315222016468/form8-k.htm"
# extract Item 1.01 Entry into a Material Definitive Agreement from 8-K filing in text format
item_1_1_text = extractorApi.get_section(url_8k, "1-1", "text")
# extract Item 4.01 Changes in Registrant's Certifying Accountant from 8-K filing in html format
item_4_1_html = extractorApi.get_section(url_8k, "4-1", "html")

# Item IDs that can be requested from a 10-K filing, one numeric family
# per row.
item_ids_10K = [
    "1", "1A", "1B", "1C",
    "2", "3", "4", "5", "6",
    "7", "7A",
    "8",
    "9", "9A", "9B",
    "10", "11", "12", "13", "14", "15",
]

# Item IDs that can be requested from a 10-Q filing. Each ID names the part
# and the item within that part.
item_ids_10Q = [
    # Part 1
    "part1item1", "part1item2", "part1item3", "part1item4",
    # Part 2
    "part2item1", "part2item1a", "part2item2", "part2item3",
    "part2item4", "part2item5", "part2item6",
]

# Item IDs that can be requested from an 8-K filing. An ID such as "1-1"
# corresponds to Item 1.01, "4-1" to Item 4.01, and so on.
item_ids_8K = [
    # Item 1.x
    "1-1", "1-2", "1-3", "1-4", "1-5",
    # Item 2.x
    "2-1", "2-2", "2-3", "2-4", "2-5", "2-6",
    # Item 3.x
    "3-1", "3-2", "3-3",
    # Item 4.x
    "4-1", "4-2",
    # Item 5.x
    "5-1", "5-2", "5-3", "5-4", "5-5", "5-6", "5-7", "5-8",
    # Item 6.x
    "6-1", "6-2", "6-3", "6-4", "6-5", "6-6", "6-10",
    # Item 7.x
    "7-1",
    # Item 8.x
    "8-1",
    # Item 9.x
    "9-1",
    # Miscellaneous
    "signature",
]

!pip install sec-api

from sec_api import ExtractorApi

# Replace the placeholder with your own API key.
extractorApi = ExtractorApi("YOUR_API_KEY")

# Tesla 10-K filing
filing_url = "https://www.sec.gov/Archives/edgar/data/1318605/000156459021004599/tsla-10k_20201231.htm"

# get the standardized and cleaned text of section 1A "Risk Factors"
section_text = extractorApi.get_section(filing_url, "1A", "text")

# get the original HTML of section 7
# "Management’s Discussion and Analysis of Financial Condition and Results of Operations"
section_html = extractorApi.get_section(filing_url, "7", "html")

print("Tesla 10-K Risk Factors Section:")
print("--------------------------------")
# show only the first 1,000 characters of the section
print(section_text[:1000] + "...")

Tesla 10-K Risk Factors Section:
--------------------------------
 ITEM 1A. RISK FACTORS
You should carefully consider the risks described below together with the other information set forth in this report, which could materially affect our business, financial condition and future results. The risks described below are not the only risks facing our company. Risks and uncertainties not currently known to us or that we currently deem to be immaterial also may materially adversely affect our business, financial condition and operating results.
Risks Related to Our Ability to Grow Our Business
We may be impacted by macroeconomic conditions resulting from the global COVID-19 pandemic.
Since the first quarter of 2020, there has been a worldwide impact from the COVID-19 pandemic. Government regulations and shifting social behaviors have limited or closed non-essential transportation, government functions, business activities and person-to-person interactions. In some cases, the relaxation of such trends has recently been followed by actual or contempla...

from IPython.display import display, HTML

# Render the first 2,034 characters of the section's HTML in the notebook.
# NOTE(review): the "<div><table><tr><td>" prefix presumably opens the
# elements that the truncated fragment starts inside of -- confirm.
display(HTML("<div><table><tr><td>" + section_html[0:2034]))

# txt version of Tesla's 10-K filing
filing_url = "https://www.sec.gov/Archives/edgar/data/1318605/000156459021004599/0001564590-21-004599.txt"

# the same get_section() call is used with the .txt URL
section_text = extractorApi.get_section(filing_url, "1A", "text")

from sec_api import ExtractorApi

extractorApi = ExtractorApi("YOUR_API_KEY")

# Tesla 10-Q filing
filing_url = "https://www.sec.gov/Archives/edgar/data/1318605/000095017022006034/tsla-20220331.htm"

# extract section 1A "Risk Factors" in part 2 as cleaned text
section_text = extractorApi.get_section(filing_url, "part2item1a", "text")

print('Tesla 10-Q filing section 1A "Risk Factors" in part 2 as cleaned text:')
print('---------------------------------------------------------------------')
# show only the first 1,000 characters of the section
print(section_text[:1000] + '...')

Tesla 10-Q filing section 1A "Risk Factors" in part 2 as cleaned text:--------------------------------------------------------------------- ITEM 1A.RISK FACTORS You should carefully consider the risks described below together with the other information set forth in this report, which could materially affect our business, financial condition and future results. The risks described below are not the only risks facing our company. Risks and uncertainties not currently known to us or that we currently deem to be immaterial also may materially adversely affect our business, financial condition and operating results. Risks Related to Our Ability to Grow Our Business We may be impacted by macroeconomic conditions resulting from the global COVID-19 pandemic. Since the first quarter of 2020, there has been a worldwide impact from the COVID-19 pandemic. Government regulations and shifting social behaviors have limited or closed non-essential transportation, government functions, business activities and person-to-person interactions. In some cases, the relaxation of such trends has been followed by actual or contemplated ret...

from sec_api import ExtractorApi

extractorApi = ExtractorApi("YOUR_API_KEY")

filing_url = "https://www.sec.gov/Archives/edgar/data/66600/000149315222016468/form8-k.htm"

# extract section 1.01 "Entry into Material Definitive Agreement" as cleaned text
section_text = extractorApi.get_section(filing_url, "1-1", "text")

print("Section 1.01 text:")
print("------------------")
print(section_text)

Section 1.01 text:------------------ Item 1.01 Entry into a Material Definitive Agreement. &#160; Quad M Solutions, Inc., an Idaho corporation, (the &#8220;Company&#8221; or &#8220;Quad M&#8221;), is a public holding company that offers staffing services and employee benefits, such as health plans, HR-human resources, and payroll services, to small and mid-sized group employers. The Company is filing this Current Report on Form 8-K to disclose recent material events, including the Company&#8217;s entry into a material agreements, through its wholly-owned subsidiary Physicians HealthCare Services LLC (&#8220;PHCS&#8221;), with Advent Health, a Florida-based clinically-integrated network that contracts with health care providers to provide certain Covered Services to Members (&#8220;Advent Health Participating Providers&#8221;) and has the ability to sign Payor contracts with Advent Health Participating Providers. &#160; Through PHCS, the Company now has immediate access to approximately 10,000 employee/workers at the 2,000+ physician offices operated by Advent Health. These employees will be immediately eligible for health coverage through the self-funded plans operated by Quad M&#8217;s subsidiaries, Nuaxess and OpenAxess. &#160; The Advent Health project was approved recently by the Company&#8217;s Board of Directors. Advent Health shares Quad M&#8217;s vision to form a strategic care program that seeks to provide quality, cost-effective Covered Services to persons enrolled in Nuaxess and OpenAxess. &#160; The Agreements between the Company and PHCS and Advent are attached hereto as Exhibit 10.13 and 10.14, respectively. &#160;

import html

# decode HTML character entities (e.g. "&#160;", "&#8220;") into the
# characters they stand for
text = html.unescape(section_text)

print("Section 1.01 text after unescaping HTML character entities:")
print("-----------------------------------------------------------")
# strip() drops the leading and trailing whitespace around the section
print(text.strip())

Section 1.01 text after unescaping HTML character entities:-----------------------------------------------------------Item 1.01 Entry into a Material Definitive Agreement.   Quad M Solutions, Inc., an Idaho corporation, (the “Company” or “Quad M”), is a public holding company that offers staffing services and employee benefits, such as health plans, HR-human resources, and payroll services, to small and mid-sized group employers. The Company is filing this Current Report on Form 8-K to disclose recent material events, including the Company’s entry into a material agreements, through its wholly-owned subsidiary Physicians HealthCare Services LLC (“PHCS”), with Advent Health, a Florida-based clinically-integrated network that contracts with health care providers to provide certain Covered Services to Members (“Advent Health Participating Providers”) and has the ability to sign Payor contracts with Advent Health Participating Providers.   Through PHCS, the Company now has immediate access to approximately 10,000 employee/workers at the 2,000+ physician offices operated by Advent Health. These employees will be immediately eligible for health coverage through the self-funded plans operated by Quad M’s subsidiaries, Nuaxess and OpenAxess.   The Advent Health project was approved recently by the Company’s Board of Directors. Advent Health shares Quad M’s vision to form a strategic care program that seeks to provide quality, cost-effective Covered Services to persons enrolled in Nuaxess and OpenAxess.   The Agreements between the Company and PHCS and Advent are attached hereto as Exhibit 10.13 and 10.14, respectively.

import multiprocessing

from sec_api import ExtractorApi

extractorApi = ExtractorApi("YOUR_API_KEY")

# number of processes to run in parallel.
# each process will extract all items from a 10-K filing
# if you have a large number of URLs, you may want to increase this number
# to speed up the extraction process.
number_of_processes = 2

urls_10k = [
    "https://www.sec.gov/Archives/edgar/data/815094/000156459019020329/abmd-10k_20190331.htm",
    "https://www.sec.gov/Archives/edgar/data/789019/000156459019027952/msft-10k_20190630.htm",
    # add more URLs of 10-K filings here
]

def extract_items_10k(filing_url):
    """Extract the text of each listed 10-K item from a single filing.

    Every item is requested with its own ``extractorApi.get_section`` call.
    A failure on one item is printed and does not stop the remaining items.

    Args:
        filing_url: URL of the 10-K filing to extract from.

    Returns:
        None. The extracted text is not returned; add your own processing
        where indicated inside the loop.
    """
    items_10_K = [
        "1", "1A", "1B", "2", "3",
        "4", "5", "6", "7", "7A",
        "8", "9A", "9B", "10", "11",
        "12", "13", "14",
    ]

    for item in items_10_K:
        print(f"Extracting item {item} from 10-K filing {filing_url}")

        try:
            section_text = extractorApi.get_section(
                filing_url=filing_url, section=item, return_type="text"
            )
            # Process section_text as needed: save to disk, store in a database, or perform analytics.
            # IMPORTANT: Avoid holding a large number of sections in memory by appending them to a list,
            # as this can lead to out-of-memory issues. Instead, ensure that memory is freed regularly
            # by allowing garbage collection to manage unused objects.
        except Exception as e:
            # best effort: report the failure and continue with the next item
            print(e)

if __name__ == "__main__":
    # distribute the filing URLs across the worker processes; each worker
    # runs extract_items_10k() on one URL at a time. The pool is cleaned up
    # when the with-block exits.
    with multiprocessing.Pool(number_of_processes) as pool:
        pool.map(extract_items_10k, urls_10k)

    import re    # Example: Removing newline characters and HTML entities    clean_text = re.sub(r"\n|&#\d+;", "", extracted_text)

# text with new line characters "\n" and HTML entities "&#160;", "&#8221;"
extracted_section = (
    "Item 1.01 Entry into a Material Definitive Agreement."
    + " \n\n&#160; \n\nQuad M Solutions, Inc., an Idaho corporation, "
    + "(the &#8220;Company&#8221; or &#8220;Quad M&#8221;),"
)

# Evaluating the bare name makes the notebook show the string's repr(),
# which displays each newline character as the escape sequence "\n".
# The HTML entities are shown as they are. Use print() to see the newline
# characters rendered as actual line breaks.
extracted_section

'Item 1.01 Entry into a Material Definitive Agreement. \n\n&#160; \n\nQuad M Solutions, Inc., an Idaho corporation, (the &#8220;Company&#8221; or &#8220;Quad M&#8221;),'

# we don't see "\n" in the printed version anymore
# because print() writes each newline character as an actual line break.
# the HTML entities are printed unchanged.
print(extracted_section)

Item 1.01 Entry into a Material Definitive Agreement. 

&#160; 

Quad M Solutions, Inc., an Idaho corporation, (the &#8220;Company&#8221; or &#8220;Quad M&#8221;),

# we use a regular expression to substitute new line characters and HTML entities
# with an empty string ""
import re

# the pattern matches a newline or a decimal numeric entity such as "&#160;"
cleaned_section = re.sub(r"\n|&#[0-9]+;", "", extracted_section)

# "\n" and HTML entities are now removed
cleaned_section

'Item 1.01 Entry into a Material Definitive Agreement.  Quad M Solutions, Inc., an Idaho corporation, (the Company or Quad M),'

# cleaned_section no longer contains newline characters, so it prints on a single line
print(cleaned_section)

Item 1.01 Entry into a Material Definitive Agreement.  Quad M Solutions, Inc., an Idaho corporation, (the Company or Quad M),

# let's decode all HTML entities to the Unicode characters they represent
# line breaks "\n" are kept
import html
import unicodedata

# all HTML entities are converted into human-readable characters
decoded_section = html.unescape(extracted_section)

# NFKC normalization replaces compatibility characters with their plain
# equivalents: the non-breaking space "\xa0" becomes a regular space.
# typographic quotes such as "\u201c" and "\u201d" are left unchanged.
decoded_section = unicodedata.normalize("NFKC", decoded_section)

decoded_section

'Item 1.01 Entry into a Material Definitive Agreement. \n\n  \n\nQuad M Solutions, Inc., an Idaho corporation, (the “Company” or “Quad M”),'

# print() writes the newline characters in decoded_section as actual line breaks
print(decoded_section)

Item 1.01 Entry into a Material Definitive Agreement. 

  

Quad M Solutions, Inc., an Idaho corporation, (the “Company” or “Quad M”),

Extract Section Items From SEC Filings With Python

Quick Start

Item IDs by Form Type

10-K Filings - Item IDs

10-Q Filings - Item IDs for Part 1 and Part 2

8-K Filings - Item IDs

Extract Item Sections from 10-K Filings

Extract Item Sections from 10-Q Filings

Extract Item Sections from 8-K Filings

Extract and Download Sections from 10-K Filings Over Multiple Years

Cleaning Extracted Sections: Removing Newline Characters and Decoding HTML Entities

Choosing an Approach

Option 1 - Replacing Newline Characters and HTML Entities

Option 2 - Convert HTML Entities to UTF-8 Characters