find - Method Usage

Bad OCR in a board of education annual financial report

This PDF contains all sorts of information about the Board of Education in Liberty County, Georgia.

pdf.find(text="FINANCIAL HIGHLIGHTS").show()

View full example →

page = pdf.find(text="FINANCIAL HIGHLIGHTS").page
page.show()

View full example →

Complex Extraction of Law Enforcement Complaints

This PDF contains a set of complaint records from a local law enforcement agency. Challenges include its relational data structure, unusual formatting common in the region, and redactions that disrupt automatic parsing.

pdf.add_exclusion(lambda page: page.find(text='L.E.A. Data Technologies').below(include_source=True))
pdf.add_exclusion(lambda page: page.find(text='Complaints By Date').above(include_source=True))

page.show(exclusions='black')

View full example →

complainant = (
  section
  .find("text:contains(Complainant)")
  .right(until='text')
)
print("Complainant is", complainant.extract_text())

View full example →

dob = (
  section
  .find("text:contains(DOB)")
  .right(until='text')
)
print("DOB is", dob.extract_text())

View full example →

number = (
    section
    .find("text:contains(Number)")
    .below(until='text', width='element')
)
print("Number is", number.extract_text())

View full example →

number = (
    section
    .find("text:contains(Number)")
    .below(until='text', width='element')
    .find('text', overlap='partial')
)

View full example →

    section
    .find("text:contains(Number)")
    .below(until='text', width='element')
    .find('text', overlap='partial')
)
print("Number is", number.extract_text())
number.show(crop=100)

View full example →

(
  section
  .find('text:contains(Date Assigned)')
  .below(width='element')
  .show(crop=100)
)

View full example →

(
  section
  .find('text:contains(Date Assigned)')
  .below(width='element')
  .find('text')
  .extract_text()

View full example →

  section
  .find('text:contains(Date Assigned)')
  .below(width='element')
  .find('text')
  .extract_text()
)

View full example →

complainant = (
  section
  .find("text:contains(Complainant)")
  .right(until='text')
)
dob = (

View full example →

)
dob = (
  section
  .find("text:contains(DOB)")
  .right(until='text')
)
address = (

View full example →

)
address = (
  section
  .find("text:contains(Address)")
  .right(until='text')
)
gender = (

View full example →

)
gender = (
  section
  .find("text:contains(Gender)")
  .right(until='text')
)
phone = (

View full example →

)
phone = (
  section
  .find("text:contains(H Phone)")
  .right(until='text')
)
date_assigned = (

View full example →

)
date_assigned = (
  section
  .find('text:contains(Date Assigned)')
  .below(width='element')
  .find('text')
)

View full example →

  section
  .find('text:contains(Date Assigned)')
  .below(width='element')
  .find('text')
)
completed = (
  section

View full example →

)
completed = (
  section
  .find('text:contains(Completed)')
  .below(width='element')
  .find('text')
)

View full example →

  section
  .find('text:contains(Completed)')
  .below(width='element')
  .find('text')
)
recorded = (
  section

View full example →

)
recorded = (
  section
  .find('text:contains(Recorded)')
  .below(until='text', width='element')
)

View full example →

rows = []
for section in sections:
    complainant = section.find("text:contains(Complainant)").right(until='text')
    dob = section.find("text:contains(DOB)").right(until='text')
    address = section.find("text:contains(Address)").right(until='text')
    gender = section.find("text:contains(Gender)").right(until='text')

View full example →

rows = []
for section in sections:
    complainant = section.find("text:contains(Complainant)").right(until='text')
    dob = section.find("text:contains(DOB)").right(until='text')
    address = section.find("text:contains(Address)").right(until='text')
    gender = section.find("text:contains(Gender)").right(until='text')
    phone = section.find("text:contains(H Phone)").right(until='text')

View full example →

for section in sections:
    complainant = section.find("text:contains(Complainant)").right(until='text')
    dob = section.find("text:contains(DOB)").right(until='text')
    address = section.find("text:contains(Address)").right(until='text')
    gender = section.find("text:contains(Gender)").right(until='text')
    phone = section.find("text:contains(H Phone)").right(until='text')
    investigator = (

View full example →

    complainant = section.find("text:contains(Complainant)").right(until='text')
    dob = section.find("text:contains(DOB)").right(until='text')
    address = section.find("text:contains(Address)").right(until='text')
    gender = section.find("text:contains(Gender)").right(until='text')
    phone = section.find("text:contains(H Phone)").right(until='text')
    investigator = (
        section

View full example →

    dob = section.find("text:contains(DOB)").right(until='text')
    address = section.find("text:contains(Address)").right(until='text')
    gender = section.find("text:contains(Gender)").right(until='text')
    phone = section.find("text:contains(H Phone)").right(until='text')
    investigator = (
        section
        .find("text:contains(Investigator)")

View full example →

    phone = section.find("text:contains(H Phone)").right(until='text')
    investigator = (
        section
        .find("text:contains(Investigator)")
        .below(until='text', width='element')
        .find('text', overlap='partial')
    )

View full example →

        section
        .find("text:contains(Investigator)")
        .below(until='text', width='element')
        .find('text', overlap='partial')
    )
    number = (
        section

View full example →

    )
    number = (
        section
        .find("text:contains(Number)")
        .below(until='text', width='element')
        .find('text', overlap='partial')
    )

View full example →

        section
        .find("text:contains(Number)")
        .below(until='text', width='element')
        .find('text', overlap='partial')
    )
    date_assigned = (
      section

View full example →

    )
    date_assigned = (
      section
      .find('text:contains(Date Assigned)')
      .below(width='element')
      .expand(left=5, right=5)
      .find('text')

View full example →

      .find('text:contains(Date Assigned)')
      .below(width='element')
      .expand(left=5, right=5)
      .find('text')
    )
    completed = (
      section

View full example →

    )
    completed = (
      section
      .find('text:contains(Completed)')
      .below(width='element')
      .expand(left=5, right=5)
      .find('text')

View full example →

      .find('text:contains(Completed)')
      .below(width='element')
      .expand(left=5, right=5)
      .find('text')
    )
    recorded = (
      section

View full example →

    )
    recorded = (
      section
      .find('text:contains(Recorded)')
      .below(until='text', width='element')
      .expand(left=5, right=5)
    )

View full example →

    # Grab the case number
    case_number = (
        section
        .find("text:contains(Number)")
        .below(until='text', width='element')
        .find('text', overlap='partial')
        .extract_text()

View full example →

        section
        .find("text:contains(Number)")
        .below(until='text', width='element')
        .find('text', overlap='partial')
        .extract_text()
    )

View full example →

Extracting Business Insurance Details from BOP PDF

This PDF is a complex insurance policy document generated for small businesses requiring BOP coverage. It contains an overwhelming amount of information across 111 pages. Challenges include varied forms that may differ slightly between carriers, making extraction inconsistent. Extraction also has to cope with different templated layouts, meaning even standard parts can shift when generated by different software.

(
    page
    .find(text="POLICY NUMBER")
    .right(until='text')
    .show()
)

View full example →

(
    page
    .find(text="POLICY NUMBER")
    .right(until='text')
    .extract_text()
)

View full example →

(
    page
    .find(text="Mailing Address")
    .expand(bottom='text')
    .show()
)

View full example →

(
    page
    .find(text="Mailing Address")
    .expand(bottom='text')
    .right()
    .extract_text()

View full example →

page = pdf.find(text="SERVICE OF SUIT").page
page.show()

View full example →

Extracting Complex Data from Serbian Regulatory PDF

This PDF contains parts of Serbian policy documents, crucial for a research project analyzing industry policies across countries. The challenge lies in extracting a large table that spans multiple pages (pages 90 to 97) and a math formula on page 98, all in Serbian. Both elements lack clear boundaries between pages, complicating extraction.

first_page = pdf.find(text="Prilog 7.").page
last_page = pdf.find(text='VISINA NAKNADE ZA ZAGAĐENJE VODA').page
pages = pdf.pages[first_page.index:last_page.index+1]
pages.show(cols=4)

View full example →

region = (
    pages
    .find(text="Tabela 4")
    .below(
        until="text:contains(Tabela 5)",
        include_endpoint=False,

View full example →

page = pdf.find(text="Obračun naknade za neposredno zagađenje voda").page
page.find("image").show()

View full example →

Extracting Data Tables from Oklahoma Booze Licensees PDF

This PDF contains detailed tables listing alcohol licensees in Oklahoma. It has multi-line cells, making it hard to extract data accurately. Challenges include alternating row colors instead of lines ("zebra stripes"), complicating row differentiation and extraction.

header = page.find(text="PREMISE").above()
footer = page.find("text:regex(Page \d+ of)")
(header + footer).show()

View full example →

print("Before exclusions:", page.extract_text()[:200])

# Add exclusions
pdf.add_exclusion(lambda page: page.find(text="PREMISE").above())
pdf.add_exclusion(lambda page: page.find("text:regex(Page \d+ of)").expand())

print("After exclusions:", page.extract_text()[:200])

View full example →


# Add exclusions
pdf.add_exclusion(lambda page: page.find(text="PREMISE").above())
pdf.add_exclusion(lambda page: page.find("text:regex(Page \d+ of)").expand())

print("After exclusions:", page.extract_text()[:200])

View full example →

region = (
    page
    .find(text="NUMBER")
    .right(include_source=True)
)
region.show(crop=100)

View full example →

headers = (
    page
    .find(text="NUMBER")
    .right(include_source=True)
    .expand(top=3, bottom=3)
    .find_all('text')

View full example →

(
    page
    .find(text="NUMBER")
    .below(width='element')
).show(crop=100, width=700)

View full example →

rows = (
    page
    .find(text="NUMBER")
    .below(
      width='element',
      include_source=True

View full example →

Extracting Economic Data from Brazil's Central Bank PDF

This PDF is the weekly “Focus” report from Brazil’s central bank with economic projections and statistics. Challenges include commas instead of decimal points, images showing projection changes, and tables without border lines that merge during extraction.

data = (
    page
    .find(text='Expectativas')
    .below(
        until='text:contains(comportamento)',
        include_endpoint=False

View full example →

row_names = (
    data
    .find(text='IPCA')
    .below(width='element', include_source=True)
    .clip(data)
    .find_all('text', overlap='partial')

View full example →

        .to_df(header=False)
        .dropna(axis=0, how='all')
        .assign(
            year=section.find('text[size~=10]:regex(\d\d\d\d)').extract_text(),
            value=headers
        )
    )

View full example →

(
    data
    .find('text:contains(2025)')
    .right(
        until='text:contains(2026)',
        include_source=True,

View full example →

table = (
    data
    .find('text:contains(2025)')
    .right(
        until='text:contains(2026)',
        include_source=True,

View full example →

table = (
    data
    .find('text:contains(2026)')
    .right(
        until='text:contains(2027)',
        include_source=True,

View full example →

table = (
    data
    .find('text:contains(2027)')
    .right(
        until='text:contains(2028)',
        include_source=True,

View full example →

table = (
    data
    .find('text:contains(2028)')
    .right(include_source=True)
    .below(width='element')
    .expand(top=-20)

View full example →

Extracting State Agency Call Center Wait Times from FOIA PDF

This PDF contains data on wait times at a state agency call center. The main focus is on the data on the first two pages, which matches other states' submission formats. The later pages provide granular breakdowns over several years. Challenges include heavy pixelation, which makes numbers and text hard to read, and inconsistent, unreadable charts.

table_area = (
    page
    .find('text:contains(Figure)')
    .below(
        until='text:contains(Please use the comments)',
        include_endpoint=False

View full example →

ICE Detention Facilities Compliance Report Extraction

This PDF is an ICE report on compliance among detention facilities over the last 20-30 years. Our aim is to extract facility statuses and contract signatories' names and dates. Challenges include strange redactions, blobby text, poor contrast, and ineffective OCR. It has handwritten signatures and dates that are redacted.

with left_col.within() as col:
    portion = (
        left_col
        .find("text:closest(Name and Location)")
        .below(
          until='text:contains(ICE Information)',
          include_endpoint=False

View full example →

label = (
    left_col
    .find("text:closest(Dates of Review)")
)
print("Found", label.extract_text())
label.show(crop=20)

View full example →

(
  left_col
  .find("text:closest(County)")
  .show(crop=50)
)

View full example →

with left_col.within() as col:
    label = left_col.find("text:closest(County)")
    answer = label.below(until='text')
    print(answer.extract_text('words'))

View full example →

with left_col.within() as col:
    label = left_col.find("text:closest(Previous Rating)")
    answer = label.below(until='text')
checkbox_region = answer.expand(5).trim()
checkbox_region.show(crop=True)

View full example →

  return region.left(20).expand(top=3)

# overlap='partial' because the OCR might be a few pixels off
acceptable = checkbox_region.find(text='Acceptable', overlap='partial')
deficient = checkbox_region.find(text='Deficient', overlap='partial')
at_risk = checkbox_region.find(text='At-Risk', overlap='partial')

View full example →


# overlap='partial' because the OCR might be a few pixels off
acceptable = checkbox_region.find(text='Acceptable', overlap='partial')
deficient = checkbox_region.find(text='Deficient', overlap='partial')
at_risk = checkbox_region.find(text='At-Risk', overlap='partial')

region1 = get_checkbox(acceptable)

View full example →

# overlap='partial' because the OCR might be a few pixels off
acceptable = checkbox_region.find(text='Acceptable', overlap='partial')
deficient = checkbox_region.find(text='Deficient', overlap='partial')
at_risk = checkbox_region.find(text='At-Risk', overlap='partial')

region1 = get_checkbox(acceptable)
region2 = get_checkbox(deficient)

View full example →

judge.add(get_checkbox(page.find(text='Field Office')))
judge.add(get_checkbox(page.find(text='HQ Review')))
judge.add(get_checkbox(page.find('text[text=Court Order]')))
judge.add(get_checkbox(page.find('text[text=Major Litigation]')))

View full example →

judge.add(get_checkbox(page.find(text='Field Office')))
judge.add(get_checkbox(page.find(text='HQ Review')))
judge.add(get_checkbox(page.find('text[text=Court Order]')))
judge.add(get_checkbox(page.find('text[text=Major Litigation]')))
judge.add(get_checkbox(page.find('text[text=Class Action Order]')))

View full example →

judge.add(get_checkbox(page.find(text='Field Office')))
judge.add(get_checkbox(page.find(text='HQ Review')))
judge.add(get_checkbox(page.find('text[text=Court Order]')))
judge.add(get_checkbox(page.find('text[text=Major Litigation]')))
judge.add(get_checkbox(page.find('text[text=Class Action Order]')))
judge.add(get_checkbox(page.find('text[text=No]')))

View full example →

judge.add(get_checkbox(page.find(text='HQ Review')))
judge.add(get_checkbox(page.find('text[text=Court Order]')))
judge.add(get_checkbox(page.find('text[text=Major Litigation]')))
judge.add(get_checkbox(page.find('text[text=Class Action Order]')))
judge.add(get_checkbox(page.find('text[text=No]')))

View full example →

judge.add(get_checkbox(page.find('text[text=Court Order]')))
judge.add(get_checkbox(page.find('text[text=Major Litigation]')))
judge.add(get_checkbox(page.find('text[text=Class Action Order]')))
judge.add(get_checkbox(page.find('text[text=No]')))

View full example →

Natural PDF basics with text and tables

Learn the fundamentals of Natural PDF - opening PDFs, extracting text with layout preservation, selecting elements by criteria, spatial navigation, and managing exclusion zones. Perfect starting point for PDF data extraction.

page.find('rect').show()

View full example →

text = page.find('rect').extract_text()
print(text)

View full example →

# Find red text
red_text = page.find('text[color~=red]')
print(red_text.extract_text())

View full example →

# Find text containing a specific string
text = page.find('text:contains("INS-")')
print(text.extract_text())

View full example →

# Extract text to the right of "Date:"
date = page.find(text="Date:").right(height='element')
date.show()

View full example →

OCR and AI magic

Master OCR techniques with Natural PDF - from basic text recognition to advanced LLM-powered corrections. Learn to extract text from image-based PDFs, handle tables without proper boundaries, and leverage AI for accuracy improvements.

table_area = (
    page
    .find('text:contains(Violations)')
    .below(
        until='text:contains(Jungle)',
        include_endpoint=False

View full example →

Working with page structure

Extract text from complex multi-column layouts while maintaining proper reading order. Learn techniques for handling academic papers, newsletters, and documents with intricate column structures using Natural PDF's layout detection features.

region = (
    flow
    .find('text:contains("Table one")')
    .below(
        until='text:contains("Table two")',
        include_endpoint=False

View full example →

page.find('table').apply_ocr()
text = page.extract_text()
print(text)

View full example →

page.find('table').show()

View full example →

data = page.find('table').extract_table()
data

View full example →

table_area = page.find("region[type=table]")
table_area.apply_ocr()

View full example →