From b6ee9094a8a029ae601b54d32a1aedcc0750e1b3 Mon Sep 17 00:00:00 2001
From: cmodevcodes <Chunhui.Mo@yahoo.com>
Date: Sat, 11 Apr 2026 14:25:04 -0400
Subject: [PATCH] Add new project

---
 Biosimilars_Finder/README.md           |  48 +++++++
 Biosimilars_Finder/biosimilars.py      | 190 +++++++++++++++++++++++++
 Biosimilars_Finder/requirements.txt    |   5 +
 Biosimilars_Finder/test_biosimilars.py | 103 ++++++++++++++
 4 files changed, 346 insertions(+)
 create mode 100644 Biosimilars_Finder/README.md
 create mode 100644 Biosimilars_Finder/biosimilars.py
 create mode 100644 Biosimilars_Finder/requirements.txt
 create mode 100644 Biosimilars_Finder/test_biosimilars.py

diff --git a/Biosimilars_Finder/README.md b/Biosimilars_Finder/README.md
new file mode 100644
index 0000000..80ea0ca
--- /dev/null
+++ b/Biosimilars_Finder/README.md
@@ -0,0 +1,48 @@
+# Biosimilars Finder
+#### Description:
+
+Query FDA open API database as well as the most recent csv file posted on FDA's
+Purple book website for available biologics and biosimilars information
+
+Class Drug stores the information retrieved for a drug, including any available biosimilars info.
+
+get_brand() queries the FDA API for the openfda data associated with a drug and stores the info in the Drug instance.
+
+Because the FDA API does not accept regular-expression queries or offer a detailed brand-name check, the pattern rf"^{name}(?![\w-])" is applied as a second-round check that the name is a valid drug in the FDA database. This filters out cases like "name-xxxx", "XX name" or "xxxnamexxx".
+
+    Parameters:
+        drug (Class Drug): Drug class instance that is used to store FDA query results
+        name (str): User input drug name to be checked with FDA database
+
+    Raises:
+        KeyError: If a drug's brand name exists in FDA's database but info such as generic/molecule name, route, or moa is not available (e.g. Humira)
+        ValueError : if the FDA API cannot be accessed
+
+get_biologics() finds the most recent PurpleBook CSV file and identifies whether a drug is a biologic and whether it has biosimilars. It retrieves the biosimilars info and stores it in the Drug class instance if available.
+
+It automatically checks 24 months starting from the current month to find the most recent Purple Book csv. The function should be called only after get_brand () is called.
+
+It determines if a drug is a biologic based on the PurpleBook CSV field "Proprietary Name".
+
+It determines if a drug has biosimilars based on the PurpleBook CSV field "Ref. Product Proprietary Name".
+
+Class Drug property self._biosimilars stores all fields of the FDA PurpleBook csv file of a drug's biosimilars as Pandas DataFrame, not just the ones that prn_biosim() prints
+
+    Parameters:
+        drug (Class Drug): Drug class instance that is used to store FDA query results
+        name (str): User input drug name to be checked with the PurpleBook csv file
+
+
+prn_biosim() prints a Drug instance's selected biosimilars information. The Class Drug property self._biosimilars stores all fields of the FDA PurpleBook csv file for a drug's biosimilars as a Pandas DataFrame, not just the ones that prn_biosim() prints, so more info can easily be added to the output table if needed.
+
+    Parameters:
+        drug (Class Drug): Drug class instance that is used to store FDA query results
+
+    ValueError : if the drug is not biologics or if it doesn't have biosimilars info
+
+
+
+TODO
+1. The three key functions - get_brand(), get_biologics(), prn_biosim() can be easily changed to instance method as they were designed as method for the class Drug
+2. One can sometimes access a PurpleBook CSV before it is officially posted on FDA's PurpleBook website. For example, in March 2026 the most recent file listed on the PurpleBook website was February 2026, yet the March 2026 csv could already be accessed. This may or may not be the intended behavior of the program.
+3. The program has to download the PurpleBook csv automatically and hold it in memory for querying because FDA doesn't currently have an API for PurpleBook. It would be great to change the PurpleBook query to an API query once an API is available.
diff --git a/Biosimilars_Finder/biosimilars.py b/Biosimilars_Finder/biosimilars.py
new file mode 100644
index 0000000..7a78f1f
--- /dev/null
+++ b/Biosimilars_Finder/biosimilars.py
@@ -0,0 +1,190 @@
+import requests
+import pandas as pnds
+import re
+import time
+from tabulate import tabulate
+from datetime import datetime
+from dateutil.relativedelta import relativedelta
+from io import StringIO
+
+
+def main():
+    """
+    Interactive entry point: look a brand name up in openFDA and the Purple Book.
+
+    Prompts for a brand name, prints the resulting Drug summary and, when the
+    drug is a biologic with biosimilars, offers to print the biosimilar table.
+    """
+    name = input("Brand Name: ").strip()
+    drug = Drug(name)
+    get_brand(drug, name)
+    if drug.is_drug:
+        # Only names confirmed by get_brand() are looked up in the Purple Book.
+        get_biologics(drug, name)
+    print(drug)
+
+    if not (drug.is_biologics and drug.has_biosimilar):
+        return
+    if input("Do you want more biosimilars info?[Y/N] ").lower().strip() in ("y", "yes"):
+        prn_biosim(drug)
+
+
+class Drug:
+    """
+    Hold what is known about one drug: brand/generic name, route, mechanism of
+    action, and whether it is a biologic with biosimilars (plus their records).
+    """
+
+    def __init__(self, brand_name=None):
+        # None replaces a mutable `[]` default; each instance gets its own list.
+        self.brand_name = [] if brand_name is None else brand_name
+        self.is_drug = False
+        self._generic_name = self._route = self._moa = ""
+        self.is_biologics = False
+        self.has_biosimilar = False
+        self._biosimilars = []  # becomes a DataFrame once biosimilars are found
+
+    def __str__(self):
+        """Return a multi-line summary, or a one-line notice when not a drug."""
+        if not self.is_drug:
+            return f"{self.brand_name} is not a brand drug based on the FDA database"
+        return (
+            f"\nBrand Name: {self.brand_name}\n"
+            f"Molecule Name: {self._generic_name}\n"
+            f"Route: {self._route}\n"
+            f"Mechanism of Action: {self._moa}\n"
+            f"Biologics: {'Yes' if self.is_biologics else 'N/A'}\n"
+            f"Biosimilars: {'Yes' if self.has_biosimilar else 'N/A'}"
+        )
+
+
+def get_brand(drug, name):
+    """
+    Query the openFDA drug-label API for a brand name and record the result on drug.
+
+    On top of the API query, the brand name it returns must also start with
+    name (case-insensitive) and not continue with a word character or hyphen;
+    this rejects hits such as "name-xxxx" or "xxxnamexxx".  Generic name, route
+    and mechanism of action fall back to "N/A" when the label lacks them.
+
+    Parameters:
+        drug (Class Drug): Drug class instance that is used to store FDA query results
+        name (str): User input drug name to be checked with FDA database
+
+    Nothing is raised for lookup failures: if the API is unreachable, answers
+    with a non-200 status, or returns no matching record, drug.is_drug is False.
+    """
+
+    url = f'https://api.fda.gov/drug/label.json?search=openfda.brand_name:"{name}"&limit=1'
+    drug.is_drug = False
+
+    try:
+        r = requests.get(url, timeout=10)
+    except requests.exceptions.RequestException:
+        # Without this return the code below would hit an unbound `r`.
+        print("FDA API not available")
+        return
+    if r.status_code != 200:
+        return
+
+    # Tolerate a 200 reply with no results, no openfda section or no brand name.
+    results = r.json().get("results") or [{}]
+    openfda = results[0].get("openfda", {})
+    fda_name = (openfda.get("brand_name") or [""])[0].strip().capitalize()
+
+    # re.escape keeps characters such as "(" or "+" in the user input literal.
+    if not name or not re.search(rf"^{re.escape(name)}(?![\w-])", fda_name, re.I):
+        return
+
+    def first(key):
+        """Return the capitalized first value of an openfda field, or "N/A"."""
+        values = openfda.get(key)
+        return values[0].capitalize() if values else "N/A"
+
+    drug.brand_name = fda_name
+    drug.is_drug = True
+    drug._generic_name = first("generic_name")
+    drug._route = first("route")
+    moa = first("pharm_class_moa")
+    # Strip the trailing " [moa]" tag only when it is really present.
+    drug._moa = moa[:-6] if moa.endswith(" [moa]") else moa
+
+
+def get_biologics(drug, name):
+    """
+    Find the most recent Purple Book CSV and flag whether a drug is a biologic
+    and whether it has biosimilars, storing the biosimilar rows on the drug.
+
+    Monthly download URLs are tried from the current month backwards for 24
+    months; the first one answering 200 with a parseable CSV is used.  The drug
+    is a biologic when name occurs in "Proprietary Name" and, if so, has
+    biosimilars when name occurs in "Ref. Product Proprietary Name" (both
+    case-insensitive, literal substring).  If no file can be read, drug is unchanged.
+
+    Parameters:
+        drug (Class Drug): Drug class instance that is used to store FDA query results
+        name (str): User input drug name to be checked with the PurpleBook csv file
+    """
+    now = datetime.now()
+    # number of months back from current to search purplebook data
+    num_months = 24
+    base_url = "https://www.accessdata.fda.gov/drugsatfda_docs/PurpleBook"
+    headers = {"user-Agent": "Mozilla/5.0"}
+    pb = None  # stays None until a monthly file has been downloaded AND parsed
+
+    for i in range(num_months):
+        past = now - relativedelta(months=i)
+        month_name = past.strftime("%B").capitalize()
+        url = f"{base_url}/{past.year}/purplebook-search-{month_name}-data-download.csv"
+        try:
+            r = requests.get(url, headers=headers, timeout=10)  # never hang on one month
+            if r.status_code == 200:
+                pb = pnds.read_csv(StringIO(r.text), skiprows=3)
+                break
+        except (requests.exceptions.RequestException, ValueError):
+            # network failure or unparseable CSV: fall through and try an older month
+            pass
+        time.sleep(2)
+
+    if pb is None:
+        return
+
+    def rows_naming(column):
+        """Return the rows whose column contains name as a literal substring."""
+        return pb[pb[column].str.contains(name, case=False, na=False, regex=False)]
+
+    drug.is_biologics = not rows_naming("Proprietary Name").empty
+    if drug.is_biologics:
+        biosimilars_matches = rows_naming("Ref. Product Proprietary Name")
+        drug.has_biosimilar = not biosimilars_matches.empty
+        if drug.has_biosimilar:
+            drug._biosimilars = biosimilars_matches
+
+
+def prn_biosim(drug):
+    """
+    Print a grid table of selected Purple Book fields for a drug's biosimilars.
+
+    Parameters:
+        drug (Class Drug): Drug class instance that is used to store FDA query results
+
+    Raises:
+        ValueError: if the drug is not biologics or if it doesn't have biosimilars info
+    """
+    # Nothing to tabulate unless both flags are set.
+    if not (drug.is_biologics and drug.has_biosimilar):
+        raise ValueError("The drug does not have biosimilar")
+
+    columns = ['Proprietary Name', 'Proper Name', 'Strength', 'Applicant', 'Approval Date']
+    print(tabulate(
+        drug._biosimilars[columns],
+        headers=['Brand Name', 'Molecule Name', 'Strength', 'Applicant', 'Approval Date'],
+        tablefmt="grid",
+        showindex=False,
+        maxcolwidths=[None, None, None, 15, None],  # cap only the Applicant column width
+    ))
+
+
+if __name__ == "__main__" :  # script entry point; importing the module does not run main()
+    main ()
+
diff --git a/Biosimilars_Finder/requirements.txt b/Biosimilars_Finder/requirements.txt
new file mode 100644
index 0000000..67890c1
--- /dev/null
+++ b/Biosimilars_Finder/requirements.txt
@@ -0,0 +1,5 @@
+requests
+pandas
+python-dateutil
+tabulate
+pytest
diff --git a/Biosimilars_Finder/test_biosimilars.py b/Biosimilars_Finder/test_biosimilars.py
new file mode 100644
index 0000000..28c21a7
--- /dev/null
+++ b/Biosimilars_Finder/test_biosimilars.py
@@ -0,0 +1,103 @@
+import pytest
+import pandas as pnds
+from biosimilars import Drug, get_brand, get_biologics, prn_biosim
+
+
+def test_init():
+    """A fresh Drug keeps the given name, has every flag off, and takes one name only."""
+    drug = Drug("remicade")
+    assert drug.brand_name == "remicade"
+    for flag in (drug.is_drug, drug.is_biologics, drug.has_biosimilar):
+        assert flag == False
+    with pytest.raises(TypeError):
+        Drug("remicade", "rituximab")
+
+
+def test_get_brand():
+    """
+    Check get_brand() against the live openFDA API: known brand names in mixed
+    case, fragments and non-drugs that must be rejected, and bad call arity.
+    """
+
+    # A known brand also yields its generic name.
+    drug = Drug()
+    get_brand(drug, "rituxan")
+    assert drug.is_drug == True
+    assert drug._generic_name == "Rituximab and hyaluronidase"
+
+    # Lookup ignores the case of the user input.
+    for known in (
+        "remicade",
+        "HumiRa",
+        "KeyTruda",
+    ):
+        drug = Drug()
+        get_brand(drug, known)
+        assert drug.is_drug == True
+
+    # Fragments, unknown names and the empty string are not drugs.
+    for unknown in (
+        "r",
+        "re",
+        "Not a Drug",
+        "",
+    ):
+        drug = Drug()
+        get_brand(drug, unknown)
+        assert drug.is_drug == False
+
+    # Both parameters are required and no extras are accepted.
+    with pytest.raises(TypeError):
+        get_brand(drug)
+    with pytest.raises(TypeError):
+        get_brand(drug, "xxx", "XXX")
+
+
+def test_get_biologics():
+    """
+    Check get_biologics() against the live openFDA API and Purple Book download.
+    """
+
+    def lookup(brand_query, book_query):
+        """Run get_brand() then get_biologics() on a fresh Drug and return it."""
+        drug = Drug()
+        get_brand(drug, brand_query)
+        get_biologics(drug, book_query)
+        return drug
+
+    # Biologics with biosimilars; the two queries may differ in case.
+    drug = lookup("rituxan", "rituxan")
+    assert (drug.is_biologics, drug.has_biosimilar) == (True, True)
+
+    drug = lookup("HuMira", "Humira")
+    assert (drug.is_biologics, drug.has_biosimilar) == (True, True)
+    assert isinstance(drug._biosimilars, pnds.DataFrame)
+
+    drug = lookup("Remicade", "reMIcade")
+    assert (drug.is_biologics, drug.has_biosimilar) == (True, True)
+    assert isinstance(drug._biosimilars, pnds.DataFrame)
+
+    # Biologic without biosimilars: no table is stored.
+    drug = lookup("KeyTruda", "keytruda")
+    assert (drug.is_biologics, drug.has_biosimilar) == (True, False)
+    assert not isinstance(drug._biosimilars, pnds.DataFrame)
+
+    # Not a biologic at all.
+    drug = lookup("lipitor", "lipitor")
+    assert (drug.is_biologics, drug.has_biosimilar) == (False, False)
+    assert not isinstance(drug._biosimilars, pnds.DataFrame)
+
+    # Both parameters are required and no extras are accepted.
+    with pytest.raises(TypeError):
+        get_biologics(drug)
+    with pytest.raises(TypeError):
+        get_biologics(drug, "xxx", "XXX")
+
+def test_prn_biosim():
+    """prn_biosim() needs a drug argument and rejects a drug without biosimilars."""
+    with pytest.raises(TypeError):
+        prn_biosim()
+    non_biologic = Drug()
+    for lookup in (get_brand, get_biologics):
+        lookup(non_biologic, "lipitor")
+    pytest.raises(ValueError, prn_biosim, non_biologic)