Download and Analyze SEC Comment Letters
In this tutorial, you will learn how to find and download SEC Comment Letters from EDGAR and perform analysis on these filings.
The tutorial consists of the following steps:
- Retrieving SEC Comment Letter URLs: Utilize the Query API to search and gather all URLs of SEC Comment Letters available on EDGAR, creating a comprehensive list.
- Downloading Comment Letters: Use the Render API to download all Comment Letters as .TXT files and save them to your local disk for further analysis.
- Analysis: We demonstrate how to analyze the distribution of comment letters over EDGAR form types in the year 2021, and how to perform topic modeling using BERTopic.
Introduction to SEC Comment Letters
SEC Comment letters are official correspondence issued by the U.S. Securities and Exchange Commission (SEC) to companies in response to their filed disclosure documents, such as annual reports (Form 10-K), quarterly reports (Form 10-Q), and registration statements (Form S-1). These letters are part of the SEC's review and comment process, which aims to ensure compliance with securities regulations and promote transparent and accurate financial reporting. They provide valuable insights into the SEC's concerns, highlight areas of potential risk, and give investors a better understanding of the company's financial disclosures and accounting practices.
When the SEC reviews a company's disclosure documents, they may identify issues, concerns, or requests for additional information. These findings are communicated to the company through Comment letters, also known as SEC Staff Letters. The letters highlight specific areas where the SEC believes further clarification, disclosure, or revision is necessary.
SEC Comment letters typically address various aspects of the company's financial statements, accounting practices, disclosures, risk factors, legal compliance, internal controls, executive compensation, and other matters relevant to public disclosure. The letters are intended to help the company enhance the quality and accuracy of its public filings and ensure compliance with applicable securities laws and regulations.
Upon receiving a Comment letter, the company is required to respond to each comment by providing the requested information, clarifying existing disclosures, or justifying its position. The response is submitted either as an amendment or updated filing to the original document or as a separate new letter to the SEC staff. The SEC may then review the company's response and provide further guidance or request additional revisions if necessary. Resolving all comments may involve multiple rounds of letters exchanged between the SEC staff and the filer.
Comment letters and the company's responses have been published on the EDGAR system since August 1, 2004. The SEC issues Comment letters for various EDGAR filings. The most common types of filings that may trigger SEC Comment letters include:
- Annual and Quarterly Reports (10-K, 10-Q, etc.): Companies are required to file annual and quarterly reports that provide a comprehensive overview of their financial performance, business operations, risk factors, and other relevant information. The SEC may issue Comment letters to seek clarification or request additional disclosures regarding the company's financial statements, accounting practices, significant transactions, and other areas of concern.
- Registration Statements (S-1, S-3, etc.): When a company plans to offer securities to the public, it must file a registration statement with the SEC. These filings provide detailed information about the company, its business, and the securities being offered. The SEC may issue Comment letters during the registration review process to ensure compliance with disclosure requirements and address any concerns or deficiencies identified in the filing.
- Proxy Statements (DEF 14A, etc.): Proxy statements are filed when a company seeks shareholder approval on matters such as elections of directors, executive compensation, and mergers. The SEC may issue Comment letters to seek clarification or request revisions regarding the information provided in the proxy statement to ensure it is accurate and complete.
- Offering Statements (Form 1-A, Form 1-K, etc.): Companies conducting offerings under Regulation A must file offering statements (Form 1-A) and annual reports (Form 1-K) with the SEC, which may also be subject to staff review and comment.
- Others (8-K, 13F, Form 4, 14D-9, etc.): The SEC may issue Comment letters for various other filings made by companies, such as current reports (Form 8-K), periodic reports by institutional investment managers (Form 13F), reports by insiders (Form 4), and filings related to mergers and acquisitions (Schedule 14D-9, Schedule TO, etc.).
It's important to note that the SEC's review and comment process is not a guarantee of the accuracy or quality of a company's financial statements. It is primarily a regulatory oversight mechanism aimed at promoting transparency and maintaining fair and efficient markets.
EDGAR Form Types of SEC Comment Letters
There are two EDGAR form types associated with SEC Comment letters and responses: UPLOAD and CORRESP.
The UPLOAD form type is used for SEC-originated letters sent to filers, while the CORRESP form type is used for filer response letters to SEC staff that are not part of amended filings.
There is sometimes confusion between the UPLOAD and LETTER types, but they actually refer to the same thing: UPLOAD is the EDGAR form type, while LETTER is the name of the file type that is uploaded to EDGAR. Therefore, when searching for SEC Comment letters, it is recommended to search for the file type LETTER instead of the UPLOAD form type.
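To make the distinction concrete, here is a minimal sketch of how Query API payloads for the two form types could be assembled; build_query is a hypothetical helper for illustration, not part of the sec-api package:

```python
# Hypothetical helper that assembles a Query API payload for a given
# EDGAR form type. Per the note above, LETTER (the file type) is the
# recommended way to search for SEC-originated comment letters.
def build_query(form_type, start=0, size=50):
    return {
        "query": f"formType:{form_type}",
        "from": str(start),
        "size": str(size),
        "sort": [{"filedAt": {"order": "desc"}}],
    }

letter_query = build_query("LETTER")    # SEC-originated comment letters
corresp_query = build_query("CORRESP")  # filer response letters
print(letter_query["query"])  # formType:LETTER
```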
When are SEC Comment Letters released to the public?
The SEC releases Comment letters and responses no earlier than 20 business days following the completion of a filing review. Prior to January 1, 2012, the stated goal was to release the correspondence no earlier than 45 days after the review of the disclosure filing was complete.
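As a rough back-of-the-envelope check, the earliest possible release date can be estimated by adding 20 weekdays to the review-completion date. This sketch ignores federal holidays, which the SEC's actual schedule would account for:

```python
from datetime import date, timedelta

def add_business_days(start: date, n: int) -> date:
    # advance n weekdays (Mon-Fri); federal holidays are ignored for simplicity
    d = start
    while n > 0:
        d += timedelta(days=1)
        if d.weekday() < 5:  # 0-4 = Monday-Friday
            n -= 1
    return d

# a review completed on Monday, 2021-03-01 could be released on or after 2021-03-29
print(add_business_days(date(2021, 3, 1), 20))  # 2021-03-29
```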
Search and Find SEC Comment Letters
The Query API provides a straightforward way to search and find SEC Comment letters and responses using the Lucene search syntax. By specifying a search expression such as formType:CORRESP, the API retrieves the metadata of the matching filings, including details about the filer (CIK, ticker, filer name, etc.) and the filing itself (filing URL, filing date, filing type, exhibit URLs, etc.).
Find UPLOAD/LETTER Filings
To locate all SEC Comment letters of the UPLOAD form type, use the following search expression:
formType:LETTER
When analyzing UPLOAD filings, there is no need to perform OCR (Optical Character Recognition) on the PDF files. Instead, you can directly download the attached .TXT file of the filing, which contains the complete content of the PDF in text format. This allows for easier text analysis without additional conversion steps.
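The sketch below illustrates the idea on a made-up miniature of an EDGAR .TXT submission file: the plain-text content sits between <TEXT> tags and can be pulled out with a non-greedy regular expression (the sample string is invented for illustration):

```python
import re

# Invented miniature of an EDGAR full-text submission file; real UPLOAD
# .TXT files wrap each document's content in <TEXT>...</TEXT> tags.
sample_txt = """<SEC-DOCUMENT>
<DOCUMENT>
<TYPE>UPLOAD
<TEXT>
Dear Mr. Example: we have reviewed your filing and have no further comments.
</TEXT>
</DOCUMENT>
</SEC-DOCUMENT>"""

matches = re.findall(r"<TEXT>(.*?)</TEXT>", sample_txt, re.DOTALL)
print(matches[0].strip())
```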
Find CORRESP Filings
CORRESP filings are usually published in HTML format and can easily be located using the following search expression:
formType:CORRESP
Example LETTER Filing
Let's examine two examples of LETTER filings to get a better understanding of their structure and content. LETTER filings range from simple notifications that the SEC has completed its review to more complex filings that contain a comprehensive set of detailed requests and comments from the SEC staff that the filer needs to address.
Example CORRESP Filing
Let's also look at several examples of CORRESP filings:
- Requesting the effectiveness of a Registration Statement on Form S-3 at a specific date and time. Source: CORRESP Filing - Request for Registration Statement
- Inclusion of additional risk factors as discussed with the SEC. Source: CORRESP Filing - Additional Risk Factors
- Requesting clarification regarding whether a sponsor is controlled by a non-U.S. person. Source Filing
- Requesting more details about the effect on investors of not having made specific arrangements, such as not utilizing an escrow account. Source Filing
- Requesting clarification on the use of specific non-GAAP line items. Source Filing
Find SEC Comment Letters
Let's explore how to find SEC Comment letters to build up a list of URLs pointing to such letters. As an example, we will demonstrate how to locate all Comment letters published in the year 2021 using the Query API. The Lucene search expression to retrieve the metadata, including the URL, of such filings is as follows:
formType:LETTER AND filedAt:[2021-01-01 TO 2021-12-31]
To begin, install the sec-api package and import the necessary modules:
!pip install -q sec-api
API_KEY = 'YOUR_API_KEY'
from sec_api import QueryApi
queryApi = QueryApi(api_key=API_KEY)
Next, create a query using the Lucene search expression to find the Comment letters published in 2021 and retrieve the filings using the Query API:
query = {
    "query": "formType:LETTER AND filedAt:[2021-01-01 TO 2021-12-31]",
    "from": "0",
    "size": "200",
    "sort": [{ "filedAt": { "order": "desc" } }]
}
response = queryApi.get_filings(query)
print('Number of LETTER filings published in 2021:', response['total']['value'])
Number of LETTER filings published in 2021: 9245
Finally, you can convert the metadata into a pandas DataFrame for further analysis:
import pandas as pd
metadata = pd.DataFrame.from_records(response['filings'])
metadata.head(3)
id | accessionNo | cik | ticker | companyName | companyNameLong | formType | description | filedAt | linkToTxt | linkToHtml | linkToXbrl | linkToFilingDetails | entities | documentFormatFiles | dataFiles | seriesAndClassesContractsInformation | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | b0289576de3f821a2096b306176ee195 | 0000000000-21-015509 | 1801417 | BYNO | byNordic Acquisition Corp | byNordic Acquisition Corp (Filed for) | LETTER | Form UPLOAD - SEC-generated letter | 2021-12-31T07:30:12-05:00 | https://www.sec.gov/Archives/edgar/data/180141... | https://www.sec.gov/Archives/edgar/data/180141... | https://www.sec.gov/Archives/edgar/data/180141... | [{'companyName': 'byNordic Acquisition Corp (F... | [{'sequence': '1', 'documentUrl': 'https://www... | [] | [] | |
1 | 7f986b3cd6a3583643094da271ae7dae | 0000000000-21-015508 | 1737995 | STSS | Sharps Technology Inc. | Sharps Technology Inc. (Filed for) | LETTER | Form UPLOAD - SEC-generated letter | 2021-12-30T18:30:03-05:00 | https://www.sec.gov/Archives/edgar/data/173799... | https://www.sec.gov/Archives/edgar/data/173799... | https://www.sec.gov/Archives/edgar/data/173799... | [{'companyName': 'Sharps Technology Inc. (File... | [{'sequence': '1', 'documentUrl': 'https://www... | [] | [] | |
2 | a6d2e09a2504eb0e4f9c4e75bb0b044e | 0000000000-21-015503 | 895464 | YBGJ | Yubo International Biotech Ltd | Yubo International Biotech Ltd (Filed for) | LETTER | Form UPLOAD - SEC-generated letter | 2021-12-30T16:30:04-05:00 | https://www.sec.gov/Archives/edgar/data/895464... | https://www.sec.gov/Archives/edgar/data/895464... | https://www.sec.gov/Archives/edgar/data/895464... | [{'companyName': 'Yubo International Biotech L... | [{'sequence': '1', 'documentUrl': 'https://www... | [] | [] |
The following function get_metadata(start_year=2021, end_year=2022) retrieves the metadata of SEC Comment letters for a specified date range. Here's how the function works:
- The function iterates over the specified range of years and months and returns a dataframe with the metadata of SEC Comment letters.
- Inside the nested loop, a Lucene query is constructed for the given year and month to filter the filings for that specific period.
- Pagination is implemented using the query_from and query_size variables. The initial value of query_from is 0, and query_size is set to 200. These variables determine the starting index and the number of filings to retrieve in each API request.
- A while loop handles the pagination and continues until the response contains no more filings.
- Within each iteration of the while loop, a Query API request is made with the constructed query to retrieve a batch of Comment letters.
- The retrieved filings are processed to extract the desired metadata: ticker, cik, formType, filedAt, filingUrl, and linkToTxt (URL of the TXT file).
- The extracted metadata is appended to a dataframe called df, and any filings without a ticker symbol are filtered out. df is then added to a list called frames that stores the metadata for all retrieved filings.
- Once all years and months are processed, the frames list is concatenated into a single dataframe called result, which contains the metadata for all downloaded filings.
def get_metadata(start_year=2021, end_year=2022):
    form_type_filter = 'formType:LETTER'
    frames = []
    for year in range(start_year, end_year + 1):
        for month in range(1, 13):
            download_counter = 0
            padded_month = str(month).zfill(2)  # "1" -> "01"
            date_range_filter = f'filedAt:[{year}-{padded_month}-01 TO {year}-{padded_month}-31]'
            lucene_query = date_range_filter + ' AND ' + form_type_filter
            query_from = 0
            query_size = 200
            while True:
                query = {
                    "query": lucene_query,
                    "from": query_from,
                    "size": query_size,
                    "sort": [{ "filedAt": { "order": "desc" } }]
                }
                response = queryApi.get_filings(query)
                filings = response['filings']
                if len(filings) == 0:
                    break
                else:
                    query_from += query_size
                metadata = list(map(lambda f: {'ticker': f['ticker'],
                                               'cik': f['cik'],
                                               'formType': f['formType'],
                                               'filedAt': f['filedAt'],
                                               'filingUrl': f['linkToFilingDetails'],
                                               'linkToTxt': f['linkToTxt']
                                               }, filings))
                df = pd.DataFrame.from_records(metadata)
                # remove all entries without a ticker symbol
                df = df[df['ticker'].str.len() > 0]
                df.loc[:, 'filingUrl'] = df['filingUrl'].apply(lambda url: url.replace('ix?doc=/', ''))
                frames.append(df)
                download_counter += len(df)
            print(f'✅ Downloaded {download_counter} metadata objects for {year}-{padded_month}')
    result = pd.concat(frames)
    print(f'✅ Download completed. Metadata downloaded for {len(result)} filings.')
    return result
metadata_2021 = get_metadata(start_year=2021, end_year=2021)
✅ Downloaded 548 metadata objects for 2021-01
✅ Downloaded 590 metadata objects for 2021-02
✅ Downloaded 685 metadata objects for 2021-03
✅ Downloaded 604 metadata objects for 2021-04
✅ Downloaded 584 metadata objects for 2021-05
✅ Downloaded 666 metadata objects for 2021-06
✅ Downloaded 618 metadata objects for 2021-07
✅ Downloaded 600 metadata objects for 2021-08
✅ Downloaded 678 metadata objects for 2021-09
✅ Downloaded 586 metadata objects for 2021-10
✅ Downloaded 540 metadata objects for 2021-11
✅ Downloaded 512 metadata objects for 2021-12
✅ Download completed. Metadata downloaded for 7211 filings.
metadata_2021.head(3)
ticker | cik | formType | filedAt | filingUrl | linkToTxt | |
---|---|---|---|---|---|---|
0 | OWL | 1823945 | LETTER | 2021-01-29T18:30:04-05:00 | https://www.sec.gov/Archives/edgar/data/182394... | https://www.sec.gov/Archives/edgar/data/182394... |
1 | SIVR | 1450922 | LETTER | 2021-01-29T17:30:06-05:00 | https://www.sec.gov/Archives/edgar/data/145092... | https://www.sec.gov/Archives/edgar/data/145092... |
2 | IMCR | 1671927 | LETTER | 2021-01-29T17:30:06-05:00 | https://www.sec.gov/Archives/edgar/data/167192... | https://www.sec.gov/Archives/edgar/data/167192... |
Download SEC Comment Letters
In this section, we demonstrate how to download all SEC Comment letters using the Render API and pandarallel. The code downloads up to 10 letters in text format (.TXT) in parallel and saves them to the local disk in the filings folder, organized by company ticker.
The download_content(filing_metadata) function downloads and processes the content of each filing. It extracts the text content from the downloaded .TXT file using regular expressions and saves it to the corresponding file in the folder structure. If the download fails, the function retries up to four times before giving up.
The number_of_workers variable determines the number of parallel workers used for downloading. In this example, we set it to 10. The sample dataframe contains a subset of the metadata (500 filings in this case), and the download_content function is applied in parallel using sample.parallel_apply. The text content of each Comment letter is also saved to sample['content'].
!pip install -q pandarallel
from sec_api import RenderApi
renderApi = RenderApi(api_key=API_KEY)
import re, os, time
def download_content(filing, retry_counter=0):
    url = filing['linkToTxt']
    ticker = filing['ticker']
    formType = filing['formType']
    try:
        new_folder = './filings/' + ticker
        date = filing['filedAt'][:10]
        file_name = date + '_' + formType + '_' + url.split('/')[-1]
        file_path = new_folder + '/' + file_name
        if not os.path.isdir(new_folder):
            os.makedirs(new_folder)
        content = renderApi.get_filing(url)
        # extract the document content between <TEXT> and </TEXT> tags
        pattern = r"<TEXT>(.*?)</TEXT>"
        matches = re.findall(pattern, content, re.DOTALL)
        if matches:
            # the first <TEXT> section of an UPLOAD filing may contain the
            # encoded PDF; in that case, use the second, plain-text section
            if '<PDF>' in matches[0]:
                extracted_text = matches[1]
            else:
                extracted_text = matches[0]
            with open(file_path, 'w') as f:
                f.write(extracted_text)
            return extracted_text
        else:
            print("No text found between <TEXT> and </TEXT> tags.")
            return ''
    except Exception as e:
        print(f"❌ {ticker}: download failed: {url}")
        print(f"Error: {str(e)}")
        if retry_counter < 4:
            time.sleep(2)
            return download_content(filing, retry_counter=retry_counter+1)
        else:
            return ''
filing = metadata_2021.iloc[140]
content = download_content(filing)
print(content)
United States securities and exchange commission logo
January 25, 2021
Mark A. Goldsmith, M.D., Ph.D.
President and Chief Executive Officer
Revolution Medicines, Inc.
700 Saginaw Drive
Redwood City, CA 94063
Re: Revolution
Medicines, Inc.
Draft Registration
Statement on Form S-1
Submitted January
21, 2021
CIK No. 0001628171
Dear Mr. Goldsmith:
This is to advise you
that we do not intend to review your registration statement.
We request that you publicly file your registration statement no
later than 48 hours prior
to the requested effective date and time. Please refer to Rules 460 and
461 regarding requests for
acceleration. We remind you that the company and its management are
responsible for the
accuracy and adequacy of their disclosures, notwithstanding any review,
comments, action or
absence of action by the staff.
Please contact David
Gessert at 202-551-2326 with any questions.
Sincerely,
Division of Corporation Finance
Office of Life Sciences
from pandarallel import pandarallel
number_of_workers = 10
pandarallel.initialize(progress_bar=True, nb_workers=number_of_workers, verbose=0)
# download a sample of 500 comment letters
sample = metadata_2021.head(500).copy()
sample['content'] = sample.parallel_apply(download_content, axis=1)
# uncomment the line below to download all SEC comment letters
# sample = metadata_2021
# sample['content'] = metadata_2021.parallel_apply(download_content, axis=1)
print('✅ Download complete')
✅ Download complete
def get_directory_size(directory):
    total_size = 0
    for path, dirs, files in os.walk(directory):
        for file in files:
            file_path = os.path.join(path, file)
            total_size += os.path.getsize(file_path)
    return total_size / (1024 * 1024)  # bytes -> MB
directory_path = './filings'
total_size_mb = get_directory_size(directory_path)
print(f"Total size of all downloaded SEC Comment letters: {total_size_mb:.2f} MB")
Total size of all downloaded SEC Comment letters: 2.02 MB
Analyze SEC Comment Letters
Now that we have downloaded all SEC Comment letters, i.e. UPLOAD filings, we can start analyzing their content. In this section, we explore two examples of analysis using SEC Comment letters.
Examples
Plot the distribution of the number of SEC staff comments over EDGAR form types. The x-axis represents the EDGAR form types, and the y-axis represents the number of comment letters per form type.
Perform topic modeling with BERTopic to identify different topics within SEC Comment letters and cluster the letters based on their topics.
Number of SEC Comment Letters per EDGAR Form Type
To determine the EDGAR form type referred to in a comment letter, we need to extract the form type from the letter's header.
Here's an example header of a letter filing that refers to the EDGAR form type "Form S-3":
sample_content = sample['content'].values[50]
print(sample_content[:800], '...')
United States securities and exchange commission logo
January 28, 2021
Kyle Guse
Chief Financial Officer and General Counsel
ATOSSA THERAPEUTICS, INC.
107 Spring Street
Seattle, Washington 98104
Re: ATOSSA
THERAPEUTICS, INC.
Registration
Statement on Form S-3
Filed January 22,
2021
File No. 333-252335
Dear Mr. Guse:
This is to advise you that we have not reviewed and will not
review your registration
statement.
Please refer to Rules 460 and 461 ...
We can extract the form types by applying a regular expression to the letter's header, i.e. the text before the "Dear" salutation, and matching phrases such as "Statement on Form S-3" or standalone form references like "Form 10-K".
def extract_form_type(text):
    try:
        # limit the search to the header, i.e. everything before "Dear"
        header_text_pattern = r".*Dear"
        header_text = re.findall(header_text_pattern, text, re.I | re.S)
        header_text = header_text[0]
        pattern = r"(Statement .*|Form [^\s]+)"
        matches = re.findall(pattern, header_text, re.I)
        matches = list(set(matches))
        matches = [re.sub(r'Statement (on|off)\s?', '', x, flags=re.I) for x in matches]
        return matches
    except Exception as e:
        print(e)
        return []
extract_form_type(sample_content)
['Form S-3']
# Blackberry letter refers to two form types: 10-K, 10-Q
sample_content = sample['content'].values[100]
print(sample_content[:800], '...')
United States securities and exchange commission logo
January 26, 2021
Steve Rai
Chief Financial Officer
Blackberry Limited
2200 University Ave East
Waterloo, ON N2K 0A7,
Canada
Re: Blackberry Ltd.
Form 10-K for
Fiscal Year Ended February 29, 2020
Filed April 7, 2020
Form 10-Q for
Quarterly Periods Ended November 30, 2020
Filed September 24,
2020
File No. 001-38232 ...
print('Form types associated with the letter sent to Blackberry:')
print(extract_form_type(sample_content))
Form types associated with the letter sent to Blackberry:
['Form 10-Q', 'Form 10-K']
By applying the extract_form_type function to the content of each letter, we can extract the relevant form types associated with the comment letters.
sample['refers_to'] = sample['content'].parallel_apply(extract_form_type)
df = sample['refers_to'].explode('refers_to')
df.reset_index(drop=True, inplace=True)
form_types = sorted(list(df.unique().astype(str)))
print('EDGAR Form Types of SEC Comment Letters:')
print('\n'.join(form_types))
EDGAR Form Types of SEC Comment Letters:
Form 1-A
Form 10
Form 10-12G
Form 10-K
Form 10-K/A
Form 10-Q
Form 20-F
Form 8-K
Form 8-K/A
Form F-1
Form F-3
Form F-4
Form S-1
Form S-11
Form S-3
Form S-4
Form SF-1
From S-1
From S-4
Schedule 14A
Schedule 14C
nan
import matplotlib.pyplot as plt
df = sample['refers_to'].explode('refers_to')
df.reset_index(drop=True, inplace=True)
value_counts = df.value_counts()
plt.figure(figsize=(10, 6))
value_counts.plot(kind='bar')
plt.xlabel('Form Type')
plt.ylabel('Number of SEC Comment Letters')
plt.title('Number of SEC Comment Letters per EDGAR Form Type (2021)')
plt.xticks(rotation=45)
plt.show()
Topic Modelling of SEC Comment Letters with BERTopic
In this section, we will explore how to use BERTopic for topic modeling on SEC Comment letters. The process is divided into two main parts:
Text cleaning and preparation: We remove the header text from each comment letter, eliminate stopwords such as "we," "it," and "what," and apply lemmatization to reduce words to their root form, so that inflected variants of a word are treated as the same token. For instance, "running" is lemmatized to "run," and, given the right part-of-speech tag, "better" is lemmatized to "good."
Fitting BERTopic to the document corpus: We will generate embedding vectors for the SEC Comment letters and create topics using the HDBSCAN clustering algorithm. This step involves grouping similar letters into topics based on their content and context.
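As a tiny illustration of the stopword-removal step, with a hand-picked stopword set standing in for NLTK's full English list used later in this section:

```python
# Minimal sketch: drop stopwords from a typical comment-letter sentence.
# The stopword set here is hand-picked for illustration; the tutorial
# itself uses nltk.corpus.stopwords.words('english').
stopwords = {'we', 'have', 'your', 'the', 'and', 'to'}
letter = 'We have reviewed your registration statement and have the following comments'
cleaned = ' '.join(w for w in letter.split() if w.lower() not in stopwords)
print(cleaned)  # reviewed registration statement following comments
```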
def remove_header(letter):
    # drop everything up to (and including) the "Dear ..." salutation
    cleaned_letter = re.sub(r'.*Dear', '', letter, flags=re.S)
    return cleaned_letter
sample['cleaned_content'] = sample['content'].parallel_apply(remove_header)
print('Example of a comment letter without its header')
print('----------------------------------------------')
print(sample['cleaned_content'].values[0][:1500], '...')
Example of a comment letter without its header
----------------------------------------------
Mr. Wasserman:
We have reviewed your registration statement and have the
following comments. In
some of our comments, we may ask you to provide us with information so
we may better
understand your disclosure.
Please respond to this letter by amending your registration
statement and providing the
requested information. If you do not believe our comments apply to your
facts and
circumstances or do not believe an amendment is appropriate, please tell
us why in your
response.
After reviewing any amendment to your registration statement and
the information you
provide in response to these comments, we may have additional comments.
Registration Statement on From S-4 filed January 4, 2021
Selected Definitions, page 3
1. Please revise to
include Seller Earnout Securities, Seller Earnout Share and Seller Earnout
Unit in your selected
definitions.
Questions and Answers About this Business Combination, page 13
2. Please revise this
section to include a question that asks and answers the question, "What
will Altimar Acquistion
Corporation equity holders receive as a result of the business
combination?" In th ...
!pip install -q nltk
import nltk
nltk.download('stopwords')
nltk.download('omw-1.4')
nltk.download('wordnet')
wn = nltk.WordNetLemmatizer()
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data] Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package wordnet to /root/nltk_data...
stopwords = nltk.corpus.stopwords.words('english')
print(f'There are {len(stopwords)} default stopwords. They are {stopwords}')
There are 179 default stopwords. They are ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]
# remove stop words
sample['content_standardized'] = sample['cleaned_content'].apply(lambda x: ' '.join([w for w in x.split() if w.lower() not in stopwords]))
# lemmatize
sample['content_standardized'] = sample['content_standardized'].apply(lambda x: ' '.join([wn.lemmatize(w) for w in x.split() if w not in stopwords]))
# remove other non-informative words
wordlist = ['please', 'page', 'comment']
sample['content_standardized'] = sample['content_standardized'].apply(lambda x: ' '.join([w for w in x.split() if w.lower() not in wordlist]))
print(sample['content_standardized'].values[0][:300])
Mr. Wasserman: reviewed registration statement following comments. comments, may ask provide u information may better understand disclosure. respond letter amending registration statement providing requested information. believe apply fact circumstance believe amendment appropriate, tell u response.
# docs = list(sample['cleaned_content'])
docs = list(sample['content_standardized'])
Vocabulary Generation
To optimize memory usage before training our model, we can pre-process our vocabulary. By creating a vocabulary of words in our dataset and setting a minimum frequency threshold, we can reduce the computational load on the tokenizer. In the code below, we generate the vocabulary by selecting words that appear at least 15 times in our data.
import collections
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer
# extract vocab to be used in BERTopic
vocab = collections.Counter()
tokenizer = CountVectorizer().build_tokenizer()
for doc in tqdm(docs):
    vocab.update(tokenizer(doc))
vocab = [word for word, frequency in vocab.items() if frequency >= 15]; len(vocab)
100%|██████████| 500/500 [00:00<00:00, 8673.34it/s]
1237
import locale
locale.getpreferredencoding = lambda: 'UTF-8'
!pip install -q bertopic
from bertopic import BERTopic
# source: https://colab.research.google.com/drive/1W7aEdDPxC29jP99GGZphUlqjMFFVKtBC?usp=sharing#scrollTo=DIv6fRecTvTw
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN
# prepare sub-models
# see here for a list of all models: https://www.sbert.net/docs/pretrained_models.html
embedding_model = SentenceTransformer('all-mpnet-base-v2')
embeddings = embedding_model.encode(docs, show_progress_bar=True)
umap_model = UMAP(n_components=5, n_neighbors=15, random_state=42, metric='cosine', verbose=True)
hdbscan_model = HDBSCAN(min_samples=20, gen_min_span_tree=True, prediction_data=False, min_cluster_size=20)
# vectorizer_model = CountVectorizer(vocabulary=vocab, stop_words="english")
# fit BERTopic using the prepared embedding, UMAP, and HDBSCAN sub-models
topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    # vectorizer_model=vectorizer_model, # causes divide by zero encountered in true_divide
    verbose=True
).fit(docs, embeddings=embeddings)
topic_model.get_topic_info()
Topic | Count | Name | Representation | Representative_Docs | |
---|---|---|---|---|---|
0 | 0 | 339 | 0_statement_disclosure_may_registration | [statement, disclosure, may, registration, not... | [Mr. Balkin: reviewed registration statement f... |
1 | 1 | 58 | 1_review_action_absence_461 | [review, action, absence, 461, acceleration, r... | [Mr. Flood: advise reviewed review registratio... |
2 | 2 | 57 | 2_review_action_sciences_life | [review, action, sciences, life, adequacy, acc... | [Mr. Schreiber: advise reviewed review registr... |
3 | 3 | 46 | 3_review_action_completed_finance | [review, action, completed, finance, adequacy,... | [Mr. Beekhuisen: completed review filing. remi... |
topic_model.get_topic(topic=1)
[('review', 0.16467594236548108),
('action', 0.15724217716679395),
('absence', 0.10993984620559281),
('461', 0.10682466720551433),
('acceleration', 0.10671969930903165),
('request', 0.10343612706686907),
('staff', 0.10165338949306918),
('advise', 0.10045037768051748),
('finance', 0.10039777356376828),
('adequacy', 0.09678466838447443)]
# reduce dimensionality of embeddings
umap_model = UMAP(n_components=5, n_neighbors=15, random_state=42, metric='cosine', verbose=True)
reduced_embeddings_2d = umap_model.fit_transform(embeddings)
UMAP(angular_rp_forest=True, metric='cosine', n_components=5, random_state=42, verbose=True)
Wed Jun 14 14:38:46 2023 Construct fuzzy simplicial set
Wed Jun 14 14:38:46 2023 Finding Nearest Neighbors
Wed Jun 14 14:38:46 2023 Finished Nearest Neighbor Search
Wed Jun 14 14:38:47 2023 Construct embedding
Epochs completed: 0%| 0/500 [00:00]
Wed Jun 14 14:38:48 2023 Finished embedding
import itertools
import pandas as pd
# define colors for the visualization to iterate over
colors = itertools.cycle(['#e6194b', '#3cb44b', '#ffe119', '#4363d8', '#f58231',
'#911eb4', '#46f0f0', '#f032e6', '#bcf60c', '#fabebe',
'#008080', '#e6beff', '#9a6324', '#fffac8', '#800000',
'#aaffc3', '#808000', '#ffd8b1', '#000075', '#808080',
'#ffffff', '#000000'])
color_key = {str(topic): next(colors) for topic in set(topic_model.topics_) if topic != -1}
# prepare dataframe and ignore outliers
df = pd.DataFrame({"x": reduced_embeddings_2d[:, 0], "y": reduced_embeddings_2d[:, 1], "Topic": [str(t) for t in topic_model.topics_]})
df['Length'] = [len(doc) for doc in docs]
df = df.loc[df.Topic != '-1']
df = df.loc[(df.y > -10) & (df.y < 10) & (df.x < 10) & (df.x > -10), :]
df['Topic'] = df['Topic'].astype('category')
# get centroids of clusters
mean_df = df.groupby('Topic').mean().reset_index()
mean_df.Topic = mean_df.Topic.astype(int)
mean_df = mean_df.sort_values('Topic')
!pip install -q adjustText
import seaborn as sns
from matplotlib import pyplot as plt
from adjustText import adjust_text
import matplotlib.patheffects as pe
fig = plt.figure(figsize=(10, 10))
sns.scatterplot(data=df, x='x', y='y', c=df['Topic'].map(color_key), alpha=0.4, sizes=(0.4, 10), size="Length")
# annotate top 50 topics
texts, xs, ys = [], [], []
for row in mean_df.iterrows():
    topic = row[1]["Topic"]
    name = " - ".join(list(zip(*topic_model.get_topic(int(topic))))[0][:3])
    if int(topic) <= 50:
        xs.append(row[1]["x"])
        ys.append(row[1]["y"])
        texts.append(plt.text(row[1]["x"], row[1]["y"], name, size=10, ha="center",
                              color=color_key[str(int(topic))],
                              path_effects=[pe.withStroke(linewidth=0.5, foreground="black")]))
# adjust annotations such that they do not overlap
adjust_text(texts, x=xs, y=ys, time_lim=1, force_text=(0.01, 0.02), force_static=(0.01, 0.02), force_pull=(0.5, 0.5))
plt.show()
# plt.savefig("visualization2.png", dpi=600)
topic_model.visualize_topics()
topic_model.visualize_barchart()
topic_model.visualize_heatmap()
References
SEC.gov Resources
- https://www.sec.gov/answers/commentletters.htm
- https://www.sec.gov/answers/edgarletters.htm
- https://www.sec.gov/news/press/2004-89.htm
- https://www.sec.gov/divisions/corpfin/cfannouncements/edgarcorrespondence
- https://www.sec.gov/news/press/2005-72.htm
- https://www.sec.gov/page/edgar-how-do-i-submit-correspondence-corresp
- https://www.sec.gov/divisions/corpfin/cffilingreview
Topic Modelling
- https://github.com/MaartenGr/BERTopic
- https://maartengr.github.io/BERTopic/getting_started/quickstart/quickstart.html
- https://www.sbert.net/docs/usage/semantic_textual_similarity.html