Search notes:

Python library: unstructured

from unstructured.partition.auto import partition

import requests
import os

# Download the Wikipedia article once and cache it locally as LLM.html;
# later runs reuse the cached copy instead of hitting the network again.
if not os.path.exists('LLM.html'):
    # A timeout keeps the script from hanging forever on a stalled connection.
    res = requests.get(
        'https://en.wikipedia.org/wiki/Large_language_model',
        timeout=30,
    )

    if res.status_code != 200:
        print(f'Failed to download the HTML file. Status Code: {res.status_code}')
        # The os module has no exit() function; raising SystemExit ends the
        # script with exit status 1 without needing an extra import.
        raise SystemExit(1)

    with open('LLM.html', 'w', encoding='utf-8') as f:
        f.write(res.text)


# Split the cached HTML page into elements using unstructured's partition().
elems = partition(filename='LLM.html')

# Report how many elements were found, then list each one next to its
# right-aligned, four-character-wide index.
print(f'Text has {len(elems)} elements')
for idx, element in enumerate(elems):
    print(f'{idx:>4} {element}')

Index