Skip to content

Text overlapping on addRedactAnnot and apply_redactions #1062

@MallRoy

Description

@MallRoy

I am facing a text-overlapping issue when applying redactions to a PDF.
I am using Python 3.9.2 and pip 20.2.3.
Image before applying redaction
image

Image After applying redaction
image

I am using the below code for redaction

# imports
import fitz
import re
import sys
import json
from matplotlib import colors


class Redactor:
    """Redact search terms and explicit quadrilaterals from a PDF.

    Every occurrence of each search term, plus every quadrilateral listed
    for a page in ``vertexes``, is covered with a filled redaction and the
    result is written to a new file.
    """

    # constructor
    def __init__(self, path, desc, out, vertexes, color):
        """Store the redaction job parameters.

        Args:
            path: Path of the PDF to open.
            desc: Comma-separated search terms; split on ``','`` and kept
                as a list in ``self.desc``.
            out: Path the redacted PDF is saved to.
            vertexes: Sequence indexed by page number. Each entry is a
                mapping with a ``"vertices"`` list; each item of that list
                holds four points given as mappings with ``"x"`` and
                ``"y"`` keys.
            color: Color specification passed to
                ``matplotlib.colors.to_rgb`` for the redaction fill.
        """
        self.path = path
        self.desc = desc.split(',')
        self.out = out
        self.vertexes = vertexes
        self.color = color

    def redaction(self):
        """Add all redactions page by page and save the result to ``self.out``.

        For each page, redaction annotations are first collected for every
        search hit and every configured quadrilateral; they are then
        applied in a single ``apply_redactions()`` call instead of once per
        annotation.

        Raises:
            IndexError: If ``self.vertexes`` has no entry for a page.
            KeyError: If a page entry or a point lacks an expected key.
        """
        # The fill is the same for every annotation, so convert it once.
        # An invalid color fails here, before the document is opened.
        fill = colors.to_rgb(self.color)

        # opening the pdf
        doc = fitz.open(self.path)
        try:
            # iterating through pages
            for page in doc:

                if not page._isWrapped:
                    page.wrapContents()

                # Text page built from the page content before any
                # redaction is applied; all searches run against it.
                tp = page.getDisplayList().getTextPage()

                for searchtext in self.desc:
                    # "".split(',') yields [""]; an empty term has nothing
                    # to match, so skip it.
                    if not searchtext:
                        continue
                    for area in tp.search(searchtext, quads=True) or ():
                        page.addRedactAnnot(area, fill=fill)

                for vertices in self.vertexes[page.number]["vertices"]:
                    corners = tuple(
                        (vertices[i]["x"], vertices[i]["y"]) for i in range(4)
                    )
                    page.addRedactAnnot(fitz.Quad(corners), fill=fill)

                # Apply every redaction annotation of this page in one pass.
                page.apply_redactions()

            # saving it to a new pdf
            doc.save(self.out, deflate=True, clean=True, linear=True)
        finally:
            # Release the document even if redaction or saving failed.
            doc.close()
        print("Successfully redacted")


# driver code for testing
if __name__ == "__main__":
    # Command-line entry point.
    #
    # Arguments: input PDF path, comma-separated search terms, output PDF
    # path, fill color. The first line of stdin must be a JSON object whose
    # "vertex" key holds the per-page quadrilaterals handed to Redactor.
    if len(sys.argv) < 5:
        sys.exit(
            "usage: " + sys.argv[0]
            + " <input.pdf> <term1,term2,...> <output.pdf> <color>"
        )

    # replace it with name of the pdf file
    path, desc, out, color = sys.argv[1:5]

    lines = sys.stdin.readlines()
    if not lines:
        # Without this check an empty stdin surfaces as a bare IndexError.
        sys.exit("expected a JSON object with a 'vertex' key on stdin")

    # Only the first input line is parsed as JSON.
    gcpvalues = json.loads(lines[0])
    vertexes = gcpvalues["vertex"]
    redactor = Redactor(path, desc, out, vertexes, color)
    redactor.redaction()

Metadata

Metadata

Assignees

Labels

postpone (postpone to a future version) · upstream bug (bug outside this package)

Type

No type

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions