The Databricks notebook below, Workbooks.py, uses a metadata dictionary (control_metadata) to dynamically create multiple worksheets across two Excel workbooks,
workbook_staff and workbook_member. Each workbook is saved to a configured .xlsx
file path and then uploaded to both Azure Blob Storage and SharePoint Online. All worksheets
are password-protected, and the data rows are programmatically hidden to prevent casual viewing or modification.
Issue:
When the spreadsheets are reopened via SharePoint, the previously hidden rows are visible again
and the worksheet protection appears to be gone, allowing users to view and potentially edit the contents
(a verification sketch for checking the round-tripped files is at the end of this post).
Workbooks.py:
import os
import json
import logging
import pandas as pd
import numpy as np
from pyspark.sql import SparkSession
from pyspark.sql.functions import col  # col lives in pyspark.sql.functions, not pyspark.sql
from datetime import datetime
import xlsxwriter
from azure.storage.blob import BlobServiceClient
from office365.runtime.auth.client_credential import ClientCredential
from office365.sharepoint.client_context import ClientContext
# Setup logging
log = logging.getLogger("Pipeline-bcp_plus_critical_member_data")
log.setLevel(logging.INFO)
handler = logging.StreamHandler()
handler.setFormatter(logging.Formatter("[%(asctime)s] [%(levelname)s] - %(message)s"))
log.addHandler(handler)
log.info("Starting Pipeline bcp_plus_critical_member_data")
# Load JSON config
config_path = "Workspace/Shared/bcp/config.json"
with open(config_path, encoding="utf-8") as f:
config = json.load(f)
# Read config values
file_path = config["file_path"]
staff_file_name = config["staff_file_name"]
member_file_name = config["member_file_name"]
output_container_name = config["output_container_name"]
output_blob_staff = config["output_blob_staff"]
output_blob_member = config["output_blob_member"]
output_storage_connection_string = config["output_storage_connection_string"]
sp_file_path_staff = config["SharePointPathStaff"]
sp_file_path_member = config["SharePointPathMember"]
url = config["url"]
client_id = config["client_id"]
# Spark + Secrets
spark = SparkSession.builder.getOrCreate()
# Workbook/worksheet protection password from the secret scope
workbook_protection_password = dbutils.secrets.get("mdp-secrets", key="workbook-password")
# Retrieve managed identity credential
uc_credential = "bac-mdp-dev-mel"
managedidentity_credential = dbutils.credentials.getServiceCredentialsProvider(uc_credential)
# Blob service client
# NOTE: account_url must be the account endpoint (https://<account>.blob.core.windows.net);
# for an actual connection string, use BlobServiceClient.from_connection_string() instead.
output_blob_service_client = BlobServiceClient(
    account_url=output_storage_connection_string,
    credential=managedidentity_credential
)
xlsx_path_staff = os.path.join(file_path, staff_file_name)
xlsx_path_member = os.path.join(file_path, member_file_name)
# -------------------------------------------------------------------
## Attempted file-open encryption + workbook structure protection
# -------------------------------------------------------------------
workbook_staff = xlsxwriter.Workbook(
    xlsx_path_staff,
    {
        'constant_memory': True,      # stream rows to disk to keep memory flat
        'strings_to_numbers': True,
        # NOTE: 'password' and 'workbook_properties' are not documented XlsxWriter
        # constructor options; XlsxWriter does not support file-open encryption,
        # so these two settings are most likely ignored when the file is written.
        'password': workbook_protection_password,
        'workbook_properties': {
            'protection': {
                'password': workbook_protection_password,
                'lock_structure': True
            }
        }
    }
)
workbook_member = xlsxwriter.Workbook(
    xlsx_path_member,
    {
        'constant_memory': True,
        'strings_to_numbers': True,
        'password': workbook_protection_password,   # same caveat as workbook_staff above
        'workbook_properties': {
            'protection': {
                'password': workbook_protection_password,
                'lock_structure': True
            }
        }
    }
)
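# For contrast, an illustrative sketch (my assumption, not from the original notebook)
# of the protection XlsxWriter does document: per-worksheet protect() options, where
# True allows an action on the protected sheet. The defaults already disallow row
# formatting, which is what normally keeps hidden rows hidden.
EXAMPLE_PROTECT_OPTIONS = {
    'format_rows': False,         # users cannot resize or unhide rows
    'format_columns': False,      # users cannot resize or unhide columns
    'select_locked_cells': True,  # users may still select (but not edit) cells
}
# Usage would be: sheet.protect(workbook_protection_password, EXAMPLE_PROTECT_OPTIONS)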
# Cell sanitizer: coerce NumPy/pandas/container values into Excel-writable types
def sanitize(val):
    if isinstance(val, np.ndarray):
        return json.dumps(val.tolist())
    if isinstance(val, np.generic):
        return val.item()
    if isinstance(val, pd.Timestamp):
        return val.to_pydatetime()
    # Containers must be checked before pd.isna(): isna() on a list returns an
    # elementwise array, and truth-testing that array raises ValueError
    if isinstance(val, (dict, list)):
        return json.dumps(val)
    if pd.isna(val):
        return None
    return val
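# Quick illustrative checks of sanitize() over the cases handled above
assert sanitize(np.int64(7)) == 7
assert sanitize(pd.Timestamp("2024-01-01")) == datetime(2024, 1, 1)
assert sanitize({"k": 1}) == '{"k": 1}'
assert sanitize(float("nan")) is None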
# control metadata
control_metadata = [
    {"worksheet_name": "MmeberAccount1", "rows_per_worksheet": 1000000,
     "extract_script": "select * from dev.TableMmeberAccount", "file_type": "Staff"},
    {"worksheet_name": "MmeberAccount2", "rows_per_worksheet": 1000000,
     "extract_script": "select * from dev.TableMmeberAccount", "file_type": "Staff"},
    {"worksheet_name": "MmeberAccount3", "rows_per_worksheet": 1000000,
     "extract_script": "select * from dev.TableMmeberAccount", "file_type": "member"},
    {"worksheet_name": "MmeberAccount4", "rows_per_worksheet": 1000000,
     "extract_script": "select * from dev.TableMmeberAccount", "file_type": "member"}
]
for row in control_metadata:
    ws_name = (row["worksheet_name"] or "Sheet")[:25]  # leave room for the _<part> suffix
    script = row["extract_script"]
    max_rows = int(row["rows_per_worksheet"])
    file_type = row["file_type"].strip().lower()
    # Pick the right workbook for this extract
    workbook = workbook_staff if file_type == "staff" else workbook_member
    try:
        log.info(f"Processing {file_type} worksheet: {ws_name}")
        df = spark.sql(script)
        # Cast timestamp columns to string so xlsxwriter receives plain values
        for field in df.schema.fields:
            if field.dataType.simpleString().startswith("timestamp"):
                df = df.withColumn(field.name, col(field.name).cast("string"))
        pdf = df.toPandas()
        total_rows = len(pdf)
        parts = (total_rows // max_rows) + (1 if total_rows % max_rows else 0)
        log.info(f"Pandas shape: {pdf.shape}, splitting into {parts} part(s)")
        for part in range(parts):
            start, end = part * max_rows, min((part + 1) * max_rows, total_rows)
            sheet_name = f"{ws_name}_{part+1}"[:31]
            sheet = workbook.add_worksheet(sheet_name)
            # Write headers
            for ci, cname in enumerate(pdf.columns):
                sheet.write(0, ci, cname)
            # Write data rows, hiding each row as it is completed
            for ri, row_vals in enumerate(pdf.iloc[start:end].values, start=1):
                for ci, cell in enumerate(row_vals):
                    sheet.write(ri, ci, sanitize(cell))
                sheet.set_row(ri, None, None, {'hidden': True})
            ## Protect each sheet so rows stay hidden and cells stay locked
            sheet.protect(password=workbook_protection_password)
    except Exception as e:
        log.error(f"Error in worksheet {ws_name}: {e}", exc_info=True)
        continue
# Close workbooks to apply all encryption & protection
workbook_staff.close()
workbook_member.close()
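# --- Illustrative workaround sketch (an assumption, not part of the original run) ---
# Since XlsxWriter has no file-open encryption, one option is a post-processing pass
# with openpyxl that re-applies workbook and sheet protection to the saved files
# before upload. Defined for reference only; deliberately not called here.
def reapply_protection(path, password):
    from openpyxl import load_workbook
    from openpyxl.workbook.protection import WorkbookProtection
    wb = load_workbook(path)
    # Lock the workbook structure (adding/removing/reordering sheets)
    wb.security = WorkbookProtection(workbookPassword=password, lockStructure=True)
    for ws in wb.worksheets:
        ws.protection.sheet = True         # enable sheet protection
        ws.protection.password = password  # the setter stores a hashed password
    wb.save(path)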
# Upload to ADLS
for fname, xpath, blob_name in [
    (staff_file_name, xlsx_path_staff, output_blob_staff + staff_file_name),
    (member_file_name, xlsx_path_member, output_blob_member + member_file_name)
]:
    blob_client = output_blob_service_client.get_blob_client(
        container=output_container_name,
        blob=blob_name
    )
    with open(xpath, "rb") as f:
        blob_client.upload_blob(f, overwrite=True)
    log.info(f"Uploaded {fname} to {output_container_name}/{blob_name}")
# SharePoint Upload
client_secret = dbutils.secrets.get("mdp-secrets", key="sp-client-secret")
tenant_id = "yyyyy"  # redacted; not required by ClientCredential below
# office365's ClientContext expects its own ClientCredential, not azure.identity's
# ClientSecretCredential, which with_credentials() does not accept.
sp_cred = ClientCredential(client_id, client_secret)
ctx = ClientContext(url).with_credentials(sp_cred)
ctx.web.get().execute_query()
folder_staff = ctx.web.get_folder_by_server_relative_url(sp_file_path_staff)
folder_member = ctx.web.get_folder_by_server_relative_url(sp_file_path_member)
# Use the full blob names (prefix + file name) that the upload loop above wrote to
for fname, blob_name, folder in [
    (staff_file_name, output_blob_staff + staff_file_name, folder_staff),
    (member_file_name, output_blob_member + member_file_name, folder_member),
]:
    blob_client = output_blob_service_client.get_blob_client(
        container=output_container_name,
        blob=blob_name
    )
    data = blob_client.download_blob().readall()
    upload_file = folder.files.add(fname, data, overwrite=True).execute_query()
    log.info(f"Uploaded {fname} to SharePoint at: {upload_file.serverRelativeUrl}")