Source code to extract text from PDF documents.
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>PDF to Text Extractor</title>
<script src="https://cdnjs.cloudflare.com/ajax/libs/pdf.js/3.4.120/pdf.min.js" integrity="sha512-ml/QKfG3+Yes6TwOzQb7aCNtJF4PUyha6R3w8pSTo/VJSywl7ZreYvvtUso7fKevpsI+pYVVwnu82YO0q3V6eg==" crossorigin="anonymous" referrerpolicy="no-referrer"></script>
<style>
  /* Page heading spans the full width with centered text. */
  h1 {
    text-align: center;
    width: 100%;
  }

  /* Both panels stack their children in a centered vertical column. */
  .pdfwork,
  .afterupload {
    display: flex;
    flex-direction: column;
    justify-content: center;
    align-items: center;
    width: 100%;
  }

  /* Small gap above every element inside the upload panel. */
  .pdfwork * {
    margin-top: 5px;
  }

  /*
   * Hidden until the script reveals them after a PDF has been processed.
   * This rule must stay below the flex rule above so that it wins the
   * cascade for .afterupload.
   */
  .afterupload,
  .another {
    display: none;
  }
</style>
</head>
<body>
<h1>PDF To Text Extractor</h1>
<div class="pdfwork">
  <!-- Hidden by the stylesheet until a PDF has been processed; reloads the page to start over. -->
  <button class="another" type="button" onclick="location.reload()">Extract Another PDF</button>

  <!-- Real labels (instead of bare spans) tie each caption to its control for assistive technology. -->
  <label for="pdf-file">Select PDF</label>
  <input class="selectpdf" id="pdf-file" type="file" accept="application/pdf">

  <label for="pdf-password">Password :</label>
  <input class="pwd" id="pdf-password" type="password" autocomplete="off" placeholder="optional">

  <button class="upload" type="button">Upload</button>

  <!-- Results panel: hidden by the stylesheet until the script displays it. -->
  <div class="afterupload">
    <label for="pdf-page">Select Page</label>
    <select class="selectpage" id="pdf-page" onchange="afterProcess()"></select>

    <!-- href is filled in by the script; the download value gives the saved file a name and .txt extension. -->
    <a class="download" href="" download="pdf-text.txt">Download Pdf Text</a>

    <textarea class="pdftext" aria-label="Extracted text"></textarea>
  </div>
</div>
<script>
// Tell pdf.js where to load its worker script from (same version as the library).
pdfjsLib.GlobalWorkerOptions.workerSrc = "https://cdnjs.cloudflare.com/ajax/libs/pdf.js/3.4.120/pdf.worker.min.js";

// Upload form controls.
const pdfinput = document.querySelector(".selectpdf");
const pwd = document.querySelector(".pwd");
const upload = document.querySelector(".upload");

// Results panel and the controls inside it.
const afterupload = document.querySelector(".afterupload");
const select = document.querySelector("select");
const download = document.querySelector(".download");
const pdftext = document.querySelector(".pdftext");
// Upload button: validate the chosen file, read it as a data URL and hand it
// to extractText together with a flag saying whether a password was typed.
upload.addEventListener('click', () => {
  const file = pdfinput.files[0];

  // Nothing selected, or the browser does not report the file as a PDF.
  if (!file || file.type !== "application/pdf") {
    alert("select a valid pdf file");
    return;
  }

  const fr = new FileReader();

  // Attach both handlers before starting the read so no event can be missed.
  fr.onload = () => {
    extractText(fr.result, pwd.value !== "");
  };
  // Without this a failed read would leave the user with no feedback at all.
  fr.onerror = () => {
    alert("could not read the selected file");
  };

  fr.readAsDataURL(file);
});
// Text of every page of the most recently processed PDF; index 0 holds page 1.
let alltext = [];

/**
 * Load a PDF with pdf.js, collect the text of each page into `alltext`,
 * rebuild the page selector with one option per page and show the result
 * through afterProcess().
 *
 * @param {string} url   Location of the PDF handed to pdfjsLib.getDocument
 *                       (the upload handler passes a data URL).
 * @param {boolean} pass When true, the current value of the password field
 *                       is sent to pdf.js along with the document.
 * @returns {Promise<void>} Never rejects: any error is reported with alert().
 */
async function extractText(url, pass) {
  try {
    // Only hand pdf.js a password when the caller says one was entered.
    const source = pass ? { url: url, password: pwd.value } : { url: url };
    const pdf = await pdfjsLib.getDocument(source).promise;

    // Collect into a local array first so that a failure part-way through
    // does not leave half of a document mixed into the shared state.
    const pageTexts = [];
    for (let pageNumber = 1; pageNumber <= pdf.numPages; pageNumber++) {
      const page = await pdf.getPage(pageNumber);
      const content = await page.getTextContent();
      // pdf.js flags the last item of each line with hasEOL; keep that line
      // break instead of fusing the end of one line with the start of the next.
      const text = content.items
        .map((item) => (item.hasEOL ? item.str + "\n" : item.str))
        .join("");
      pageTexts.push(text);
    }

    // Replace (rather than append to) the previous results, so processing a
    // second PDF without reloading does not duplicate pages or options.
    alltext = pageTexts;
    select.innerHTML = "";
    pageTexts.forEach((_, index) => {
      const label = String(index + 1);
      select.add(new Option(label, label));
    });

    afterProcess();
  } catch (err) {
    alert(err.message);
  }
}
// Display the text of the page currently chosen in the page selector, point
// the download link at that same text, and reveal the results panel together
// with the "Extract Another PDF" button.
function afterProcess() {
  // Option values are 1-based page numbers; alltext is 0-based.
  const pageText = alltext[select.value - 1];
  const anotherButton = document.querySelector(".another");

  pdftext.value = pageText;
  download.href = "data:text/plain;charset=utf-8," + encodeURIComponent(pageText);

  afterupload.style.display = "flex";
  anotherButton.style.display = "unset";
}
</script>
</body>
</html>
No comments:
Post a Comment