scrape pdf out of link

32

import requests
from bs4 import BeautifulSoup
import io
from PyPDF2 import PdfFileReader
 
 
url = "https://www.geeksforgeeks.org/how-to-extract-pdf-tables-in-python/"
read = requests.get(url)
html_content = read.content
soup = BeautifulSoup(html_content, "html.parser")
 
list_of_pdf = set()
l = soup.find('p')
p = l.find_all('a')
 
for link in (p):
    pdf_link = (link.get('href')[:-5]) + ".pdf"
    print(pdf_link)
    list_of_pdf.add(pdf_link)
 
def info(pdf_path):
    response = requests.get(pdf_path)
     
    with io.BytesIO(response.content) as f:
        pdf = PdfFileReader(f)
        information = pdf.getDocumentInfo()
        number_of_pages = pdf.getNumPages()
 
    txt = f"""
    Information about {pdf_path}:
 
    Author: {information.author}
    Creator: {information.creator}
    Producer: {information.producer}
    Subject: {information.subject}
    Title: {information.title}
    Number of pages: {number_of_pages}
    """
    print(txt)
    return information
 
 
for i in list_of_pdf:
    info(i)

Comments

Submit
0 Comments