Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Copying Images from document #1457

Open
rbp9802 opened this issue Dec 21, 2024 · 1 comment
Open

Copying Images from document #1457

rbp9802 opened this issue Dec 21, 2024 · 1 comment

Comments

@rbp9802
Copy link

rbp9802 commented Dec 21, 2024

Hello everyone!
I'm trying to copy the content of one document into another document, but I failed to copy the image relationships.

So:

  1. copy_body(source_doc, target_doc) copies the body from source_doc to target_doc. Relationships are not adjusted.
  2. copy_relations(source_doc, target_doc) adds the image relationships present in source_doc to target_doc and saves the new rIds as a dict (old rId --> new rId).
  3. After this, I look for the images copied into target_doc with for pic in element.xpath('.//a:blip') and change the rId stored in pic.attrib[qn('r:embed')] to new_rId.
  4. The pictures are not displayed when opening the saved document.

image

import os
from io import BytesIO
from docx import Document
from copy import deepcopy
from docx.oxml.ns import qn

def copy_body(source_doc, target_doc):
    """Append a deep copy of the body content of ``source_doc`` to ``target_doc``.

    Every direct child of the source ``w:body`` is copied except the
    body-level ``w:sectPr`` (the source's final section properties). The
    copies are placed immediately before the target's own ``w:sectPr`` when
    it has one, so the section properties remain the last child of the body;
    otherwise they are appended at the end.

    Only the XML is copied. Relationship ids referenced by the copied
    elements (for example ``r:embed`` on pictures) are not adjusted here.

    Args:
        source_doc: Document whose body content is copied.
        target_doc: Document that receives the copies.
    """
    sect_pr_tag = qn('w:sectPr')
    target_body = target_doc.element.body
    target_sect_pr = target_body.find(sect_pr_tag)

    for element in source_doc.element.body.iterchildren():
        if element.tag == sect_pr_tag:
            # The target keeps its own section properties; a second
            # body-level sectPr would make the body invalid.
            continue
        copied_element = deepcopy(element)
        if target_sect_pr is None:
            target_body.append(copied_element)
        else:
            # Inserting before the same anchor each time keeps source order.
            target_sect_pr.addprevious(copied_element)
        
def copy_relations(source_doc, target_doc):
    """Copy the embedded images of ``source_doc`` into ``target_doc``.

    For every internal image relationship of the source document part, the
    image bytes are added to the target package and a relationship from the
    target document part to that image part is obtained.

    Args:
        source_doc: Document whose image relationships are read.
        target_doc: Document that receives the image parts and relationships.

    Returns:
        dict mapping each source image rId to the corresponding rId in
        ``target_doc``.
    """
    # Imported locally so the function brings its own dependency into scope.
    from docx.opc.constants import RELATIONSHIP_TYPE as RT

    new_rels = {}
    source_rels = source_doc.part.rels
    for rId in sorted(source_rels):
        rel = source_rels[rId]
        # Select images by relationship type instead of guessing from the
        # part name; external (linked) targets have no part to copy.
        if rel.reltype != RT.IMAGE or rel.is_external:
            continue

        # Get image binary data
        img_data = rel.target_part.blob

        # Create (or reuse) an image part in the target package
        image_part = target_doc.part.package.get_or_add_image_part(BytesIO(img_data))

        # The relationship type has to be the full image relationship URI;
        # the bare string 'image' is written to the .rels file verbatim and
        # is not a recognised image relationship type.
        new_rId = target_doc.part.relate_to(image_part, RT.IMAGE)
        new_rel = target_doc.part.rels[new_rId]
        print(
            f'Image copied: {rId} {rel.target_ref} {len(img_data)/1024:.3f}kB '
            f'--> {new_rId} {new_rel.target_ref} {len(image_part.blob)/1024:.3f}kB'
        )
        new_rels[rId] = new_rId
    return new_rels

# Example: append the body of one document to another and re-point the
# copied pictures at the image relationships created in the target.
folder = os.path.dirname(__file__)
source_file = 'doc_2_copy.docx'
target_file = 'src_doc.docx'
final_file = 'new_doc.docx'

source_doc = Document(os.path.join(folder, source_file))
target_doc = Document(os.path.join(folder, target_file))

# Remember which body elements the target had before copying. Their rIds
# belong to the target's own relationships and must not be remapped.
preexisting = set(target_doc.element.body.iterchildren())

copy_body(source_doc, target_doc)  # Copy body
new_rels = copy_relations(source_doc, target_doc)  # Copy relations and save rId's in new_rels

# Modify relations rId
embed_attr = qn('r:embed')
for element in target_doc.element.body.iterchildren():
    if element in preexisting:
        continue
    # Look for images in the copied elements
    for pic in element.xpath('.//a:blip'):
        rId = pic.get(embed_attr)  # Getting old rId (None when not embedded)
        new_rId = new_rels.get(rId)
        if new_rId is None:
            # No image relationship was copied for this blip; leave it alone.
            continue
        print(f"Image rID : {rId}")
        if rId != new_rId:
            print(f'  Modifying rId: {rId} -> {new_rId}')
            pic.set(embed_attr, new_rId)  # Setting new rId

target_doc.save(final_file)

Running the script prints in terminal:

Image copied: rId5 media/image1.png 28.188kB --> rId6 media/image1.png 28.188kB
Image copied: rId6 media/image2.png 24.028kB --> rId7 media/image2.png 24.028kB
Image rID: rId5
  Modifying rId: rId5 -> rId6
Image rID: rId6
  Modifying rId: rId6 -> rId7

I don't know what I'm missing, help please!

@rbp9802
Copy link
Author

rbp9802 commented Dec 23, 2024

UPDATE:

I managed to track the picture locations in the document, delete them, and re-add them as new pictures. This is a much longer way than expected, but it works.

Anyway I appreciate any comments on the original issue.

Thanks!

The process was:

  1. Extract original pictures from source document and save them in the device
  2. Copy source document body via deepcopy
  3. Find new pictures in target document, delete and re-add them as new picture.
import os
from docx import Document
from copy import deepcopy

from docx.text.paragraph import Paragraph
from docx.text.run import Run

from docx.oxml.ns import qn
from lxml import etree

def copy_body(source_doc, target_doc, next_to_element=None):
    """Insert a deep copy of the body content of ``source_doc`` into ``target_doc``.

    The source's body-level ``w:sectPr`` is not copied, so the target keeps a
    single set of final section properties.

    Args:
        source_doc: Document whose body content is copied.
        target_doc: Document that receives the copies.
        next_to_element: Element after which the copies are inserted, in
            source order. When ``None`` the copies go at the end of the
            target content: before the target's ``w:sectPr`` if it has one,
            otherwise appended to the body.
    """
    sect_pr_tag = qn('w:sectPr')
    copies = [
        deepcopy(element)
        for element in source_doc.element.body.iterchildren()
        if element.tag != sect_pr_tag
    ]

    if next_to_element is not None:
        # addnext() puts each element directly after the anchor, so walking
        # the copies backwards leaves them in source order.
        for copied_element in reversed(copies):
            next_to_element.addnext(copied_element)
        return

    target_body = target_doc.element.body
    target_sect_pr = target_body.find(sect_pr_tag)
    for copied_element in copies:
        if target_sect_pr is None:
            target_body.append(copied_element)
        else:
            # Keep the section properties as the last child of the body.
            target_sect_pr.addprevious(copied_element)

def check_images(doc, images_folder=None):
    """Replace inline pictures in ``doc`` with pictures re-added from disk.

    For every ``a:blip`` in the document body whose ``r:embed`` id matches
    the base name of a file in ``images_folder``, the enclosing inline
    ``w:drawing`` is removed from its run and the file is added to that run
    as a new picture with the original extent (``cx`` / ``cy``).

    Pictures are left untouched when no matching file exists, when the
    drawing is not a direct child of a run, or when it is not an inline
    drawing (no ``wp:inline`` / ``wp:extent``).

    NOTE: matching is by rId only, so a picture that was already in ``doc``
    and happens to use an rId that also names an extracted file is replaced
    as well.

    Args:
        doc: Document whose body is scanned and modified in place.
        images_folder: Folder holding files named ``<rId><extension>``.
            Defaults to the module-level ``output_folder``.

    Returns:
        list of the inline shapes returned by ``run.add_picture``.
    """
    if images_folder is None:
        images_folder = output_folder

    # Index files by base name so the lookup works for any extension, not
    # only '.png'. Sorting makes the choice deterministic if several files
    # share a base name.
    extracted = {}
    for name in sorted(os.listdir(images_folder)):
        extracted.setdefault(os.path.splitext(name)[0], os.path.join(images_folder, name))

    run_tag = qn('w:r')
    drawing_tag = qn('w:drawing')
    embed_attr = qn('r:embed')

    added_images = []
    for element in doc.element.body.iterchildren():
        for pic in element.xpath('.//a:blip'):
            # Get relation Id and the file extracted for it, if any
            image_path = extracted.get(pic.get(embed_attr))
            if image_path is None:
                continue

            # Get run that contains picture
            r = get_parent_run(pic)

            # Find the drawing that actually owns this blip (a run may hold
            # more than one drawing); stop at the run at the latest.
            drawing = pic
            while drawing.tag not in (drawing_tag, run_tag):
                drawing = drawing.getparent()
            if drawing.tag != drawing_tag or drawing.getparent().tag != run_tag:
                continue

            # Get picture size; anchored drawings have no wp:inline
            inline = drawing.find(qn('wp:inline'))
            extent = inline.find(qn('wp:extent')) if inline is not None else None
            if extent is None:
                continue
            cx = int(extent.get('cx'))
            cy = int(extent.get('cy'))

            # Remove old drawing only now that a replacement is certain
            r.remove(drawing)

            # Add picture to the same run
            run = Run(r, Paragraph(r.getparent(), doc))
            inline_shape = run.add_picture(image_path)
            inline_shape.width = cx
            inline_shape.height = cy

            added_images.append(inline_shape)
    return added_images
                
def get_parent_run(element):
    """Return ``element`` itself or its nearest ancestor that is a ``w:r``.

    The search walks up through ``getparent()`` one level at a time, starting
    with ``element``.
    """
    run_tag = qn('w:r')
    node = element
    while True:
        if node.tag == run_tag:
            return node
        node = node.getparent()

def extract_images(doc, output_folder):
    """Write the embedded images of ``doc`` to ``output_folder``.

    Each internal image relationship of the document part is written to a
    file named ``<rId><extension>``. The extension is taken from the
    relationship's target reference and falls back to ``.png`` when the
    target has none. The folder is created if it does not exist.

    Args:
        doc: Document whose image relationships are read.
        output_folder: Folder that receives the image files.
    """
    # Imported locally so the function brings its own dependency into scope.
    from docx.opc.constants import RELATIONSHIP_TYPE as RT

    os.makedirs(output_folder, exist_ok=True)
    for rId, rel in doc.part.rels.items():
        # Select images by relationship type instead of guessing from the
        # part name; external (linked) targets have no part to read.
        if rel.reltype != RT.IMAGE or rel.is_external:
            continue
        extension = os.path.splitext(rel.target_ref)[1] or ".png"
        output_path = os.path.join(output_folder, f"{rId}{extension}")
        with open(output_path, "wb") as f:
            f.write(rel.target_part.blob)
        print(f"Imagen extraída: {output_path}")

# Example: copy the body of one document into another and rebuild its
# pictures from image files extracted to disk.
# Input documents are looked up next to this script.
folder = os.path.dirname(__file__)
source_file = 'document_to_copy.docx'
target_file = 'document_base.docx'
# Relative path: the result is saved relative to the current working
# directory, not to `folder`.
output_file = 'document_copy.docx'

# Also read as a module-level name inside check_images(), so it must be
# defined before check_images() is called. Relative to the working directory.
output_folder = 'extracted_images'

# Open documents
source_doc = Document(os.path.join(folder, source_file))
target_doc = Document(os.path.join(folder, target_file))

# Extract source_doc images
extract_images(source_doc, output_folder)

# Copy body: insert the copied content right after the target's first
# paragraph (raises IndexError if the target has no paragraphs).
after_paragraph = target_doc.paragraphs[0]
copy_body(source_doc, target_doc, next_to_element=after_paragraph._element)

# Check images: replace the copied drawings with pictures re-added from
# the extracted files.
added_images = check_images(target_doc)

target_doc.save(output_file)
print(f"Document saved as: {output_file}")

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

1 participant