```python
import os
import shutil
import subprocess

import mermaid as md
import numpy as np
import pandas as pd
from mermaid.graph import Graph
def deduplicate(df, cols, csv_dir='.'):
    """Deduplicate records on a set of key columns and write the results to CSV.

    Rows with a null in any key column are never compared with each other:
    they are set aside and appended, untouched, to the deduplicated output.
    Among the remaining rows, the first row of every group sharing the same
    key values is kept and the later ones are removed.

    Args:
        df: DataFrame of records. When ``None``, the user is prompted for a
            file name (without extension) and ``<csv_dir>/<name>.csv`` is read.
        cols: Key columns, as a list of column labels or a comma-separated
            string. When ``None``, the user is prompted for them.
        csv_dir: Directory the input CSV is read from and every output file is
            written to. Defaults to the current directory.

    Returns:
        A tuple ``(results, df_nulls, df_deduplicated, df_kept, df_removed,
        output_file_name, prisma_file_name)``. ``results`` maps each stage
        name to its row count; ``prisma_file_name`` is the ``.mmd`` path that
        sits next to the deduplicated CSV (this function does not create it).

    Raises:
        ValueError: If no key column is given.
        KeyError: If a key column is not present in ``df``.
    """
    if df is None:
        # The leading './' is redundant for the file system; it is kept so the
        # prompted path has the same spelling as the output paths below.
        input_file_name = os.path.join(csv_dir, './' + input('Enter file name: ') + '.csv')
        df = pd.read_csv(input_file_name)  # A (records)

    if cols is None:
        cols = input('Enter the columns for which to deduplicate based on: ')
    if isinstance(cols, str):
        cols = [c.strip() for c in cols.split(',') if c.strip()]
    else:
        cols = list(cols)

    if not cols:
        raise ValueError('at least one column is required to deduplicate on')
    missing = [c for c in cols if c not in df.columns]
    if missing:
        raise KeyError(f'columns not found in the data: {missing}')

    # Build every path from one stem rather than str.replace() on the finished
    # path, which would also rewrite a matching directory or column name.
    stem = os.path.join(csv_dir, './' + '_'.join(map(str, cols)))
    output_file_name = stem + '_deduplicated.csv'
    nulls_file_name = stem + '_nulls.csv'
    removed_file_name = stem + '_deduplicated_removed.csv'
    prisma_file_name = stem + '_deduplicated.mmd'

    nulls_mask = df[cols].isnull().any(axis=1)
    df_nulls = df[nulls_mask]  # B
    df_non_nulls = df[~nulls_mask]  # C

    # keep=False flags every member of a duplicate group, not only the repeats.
    duplicates_mask = df_non_nulls.duplicated(subset=cols, keep=False)
    df_non_duplicates = df_non_nulls[~duplicates_mask]  # D
    df_duplicates = df_non_nulls[duplicates_mask]  # E

    # Split by position with a boolean mask so the result stays correct even
    # when the DataFrame index contains repeated labels.
    repeats_mask = df_duplicates.duplicated(subset=cols, keep='first')
    df_kept = df_duplicates[~repeats_mask]  # F
    df_removed = df_duplicates[repeats_mask]  # G

    df_unique = pd.concat([df_kept, df_non_duplicates])  # H
    df_deduplicated = pd.concat([df_unique, df_nulls], ignore_index=True)  # I

    results = {
        'records': len(df),
        'nulls': len(df_nulls),
        'non_nulls': len(df_non_nulls),
        'non_duplicates': len(df_non_duplicates),
        'duplicates': len(df_duplicates),
        'removed': len(df_removed),
        'kept': len(df_kept),
        'unique': len(df_unique),
        'deduplicated': len(df_deduplicated),
    }

    df_nulls.to_csv(nulls_file_name, index=False)
    df_deduplicated.to_csv(output_file_name, index=False)
    df_removed.to_csv(removed_file_name, index=False)

    return results, df_nulls, df_deduplicated, df_kept, df_removed, output_file_name, prisma_file_name
if __name__ == '__main__':
    # Run one deduplication pass; deduplicate() prompts for the input file and
    # the key columns, writes the CSV outputs and returns the per-stage counts.
    results, df_nulls, df_deduplicated, df_kept, df_removed, output_file_name, prisma_file_name = deduplicate(df=None, cols=None)

    # Mermaid source for the flow diagram. The placeholders are the keys of
    # `results` and are filled in by str.format() below.
    graph_template = '\n'.join([
        '---',
        # Front matter is YAML, so the settings must be nested under `config`.
        # NOTE(review): `curve` is placed under `flowchart` on the assumption
        # that the flowchart edge curve was intended - confirm.
        'config:',
        '  theme: neutral',
        '  flowchart:',
        '    curve: stepBefore',
        '---',
        'graph TD;',
        'A["**records** (*n* = {records})"];',
        'B["null (*n* = {nulls})"];',
        'C["non-null (*n* = {non_nulls})"];',
        'D["non-duplicates (*n* = {non_duplicates})"];',
        'E["duplicates (*n* = {duplicates})"];',
        'F["duplicates kept (*n* = {kept})"];',
        'G["duplicates removed (*n* = {removed})"];',
        'H["unique (*n* = {unique})"];',
        'I["deduplicated (*n* = {deduplicated})"];',
        'A --> B & C;',
        'C --> D & E;',
        'E --> F & G;',
        'D & F --> H;',
        'B & H --> I',
    ])
    graph_text = graph_template.format(**results)

    with open(prisma_file_name, 'w', encoding='utf-8') as f:
        f.write(graph_text)

    # Swap only the extension; a plain replace('mmd', 'svg') would also rewrite
    # any 'mmd' that appears in a directory or column name.
    svg_file_name = os.path.splitext(prisma_file_name)[0] + '.svg'

    # Resolve the mermaid-cli executable explicitly (on Windows this also finds
    # the `mmdc.cmd` shim) so the argument list can be passed without a shell.
    # With shell=True and a list, POSIX runs only `mmdc` and drops the rest.
    mmdc = shutil.which('mmdc')
    if mmdc is None:
        print('mmdc (mermaid-cli) was not found on PATH; skipped rendering ' + svg_file_name)
    else:
        completed = subprocess.run([mmdc, '-i', prisma_file_name, '-o', svg_file_name])
        if completed.returncode != 0:
            print(f'mmdc exited with status {completed.returncode}; {svg_file_name} may not have been written')
```
**flowchart** ^flowchart
```mermaid
graph TD;
%% Record-flow diagram: three chained deduplication passes followed by screening.
%% Every pass has the same shape: input, split into null and non-null,
%% non-null split into non-duplicates and duplicates, duplicates split into
%% kept and removed, then non-duplicates plus kept give unique, and
%% null plus unique give the pass output.
%% Pass 1, nodes A to I: output is labelled DOI deduplicated.
A["records (*n* = 1,309)"];
B["null (*n* = 238)"];
C["non-null (*n* = 1,071)"];
D["non-duplicates (*n* = 169)"];
E["duplicates (*n* = 902)"];
F["duplicates kept (*n* = 303)"];
G["duplicates removed <br>(*n* = 599)"];
H["unique (*n* = 472)"];
I["DOI deduplicated (*n* = 710)"];
%% Pass 2, nodes J to Q: output is labelled Title+Author+Year deduplicated.
J["null (*n* = 123)"];
K["non-null (*n* = 587)"];
L["non-duplicates (*n* = 454)"];
M["duplicates (*n* = 133)"];
N["duplicates kept (*n* = 55)"];
O["duplicates removed<br> (*n* = 78)"];
P["unique (*n* = 509)"];
Q["Title+Author+Year<br>deduplicated (*n* = 632)"];
%% Pass 3, nodes R to X: the key used for this pass is not stated in the diagram.
%% NOTE(review): Q is 632 but its children R + S add up to 0 + 629 = 629,
%% so three records are unaccounted for between pass 2 and pass 3 - confirm the counts.
R["null (*n* = 0)"];
S["non-null (*n* = 629)"];
T["non-duplicates (*n* = 549)"];
U["duplicates (*n* = 80)"];
V["duplicates kept (*n* = 36)"];
W["duplicates removed (*n* = 44)"];
X["unique (*n* = 585)"];
%% Screening: the pass 3 result feeds the screened and retrieval nodes.
Y["Records screened (*n* = 585)"];
Z["Reports sought for retrieval (*n* = 476)"]
%% Edges for pass 1.
A --> B & C;
C --> D & E;
E --> F & G;
D & F --> H;
B & H --> I;
%% Edges for pass 2, starting from the pass 1 output.
I --> J & K;
K --> L & M;
M --> N & O;
L & N --> P;
J & P --> Q;
%% Edges for pass 3, starting from the pass 2 output.
Q --> R & S;
S --> T & U;
U --> V & W;
T & V --> X;
R & X --> Y;
%% Screening edge.
Y --> Z
```