How to remove repeated files from Google Drive in Python

34

def _list_children(service, parent_id):
    """Return ``(name, id)`` tuples for every Drive file whose parent is *parent_id*.

    Follows ``nextPageToken`` until the listing is exhausted, so folders with
    more entries than fit in a single response page are not silently truncated.
    """
    children = []
    page_token = None
    while True:
        response = service.files().list(
            q="'{}' in parents".format(parent_id),
            pageSize=1000,
            fields="nextPageToken, files(id, name)",
            pageToken=page_token,
        ).execute()
        children.extend(
            (entry['name'], entry['id']) for entry in response.get('files', [])
        )
        page_token = response.get('nextPageToken')
        if not page_token:
            return children


def _download_file(service, file_id, destination):
    """Download the Drive file *file_id* to the local path *destination*.

    Chunks are written straight to disk instead of being accumulated in an
    in-memory buffer first, and progress is printed after every chunk.
    """
    request = service.files().get_media(fileId=file_id)
    with open(destination, 'wb') as out:
        downloader = MediaIoBaseDownload(out, request)
        done = False
        while not done:
            status, done = downloader.next_chunk()
            print("Download: {}".format(int(status.progress() * 100)))


def _clear_directory(directory):
    """Delete the non-hidden entries of *directory*, keeping the directory itself."""
    import shutil

    for entry_name in os.listdir(directory):
        # The previous shell glob (`rm -rf <dir>/*`) never matched dotfiles;
        # keep that behaviour.
        if entry_name.startswith('.'):
            continue
        entry_path = os.path.join(directory, entry_name)
        if os.path.isdir(entry_path) and not os.path.islink(entry_path):
            shutil.rmtree(entry_path)
        else:
            os.remove(entry_path)


def main(temporary_workspace, workspace):
    """Download each Drive subfolder's files and pass them to ``convert_all_netcdf``.

    Authorises against the Drive v3 API (token cached in ``tokenRead.json``,
    client secrets read from ``credentials.json``), lists the children of the
    main folder, and processes them in name order.  For every subfolder, all
    of its files are downloaded into *temporary_workspace*,
    ``convert_all_netcdf`` is called on that directory, and the directory is
    emptied before the next subfolder is handled.

    Args:
        temporary_workspace: Local directory that receives the downloads for
            one subfolder at a time; its non-hidden contents are deleted
            after each subfolder.
        workspace: Forwarded unchanged to ``convert_all_netcdf``.

    Raises:
        RuntimeError: If the main folder or any subfolder has no children.
        SystemExit: If the same Drive file ID is encountered twice.
    """
    store = file.Storage('tokenRead.json')
    creds = store.get()
    if not creds or creds.invalid:
        flow = client.flow_from_clientsecrets('credentials.json', SCOPES)
        creds = tools.run_flow(flow, store)
    service = build('drive', 'v3', http=creds.authorize(Http()))

    folders = _list_children(service, 'MAIN_FOLDER_WITH_SUBFOLDERS_ID')
    if not folders:
        raise RuntimeError('No files found.')
    folders.sort(key=lambda folder: folder[0])

    # A set gives O(1) duplicate checks across all subfolders.
    seen_file_ids = set()

    for folder_name, folder_id in folders:
        # The folder name minus its last three characters is what gets passed
        # on as the start date.
        # NOTE(review): presumably the name carries a 3-character suffix - confirm.
        start_date = folder_name[:-3]

        print('Folder: ', start_date, ', ID: ', folder_id)

        folder_files = _list_children(service, folder_id)
        if not folder_files:
            raise RuntimeError('No files found.')

        for file_name, file_id in folder_files:
            if file_id in seen_file_ids:
                print('Duplicate file ID!')
                # Same effect as the former bare exit(), without depending on
                # the `site`-provided builtin.
                raise SystemExit
            seen_file_ids.add(file_id)

            print('\tFile: ', file_name, ', ID: ', file_id)

            _download_file(
                service, file_id, os.path.join(temporary_workspace, file_name)
            )

        convert_all_netcdf(temporary_workspace, start_date, workspace, r'Qout_south_america_continental',
                           num_of_rivids=62317)

        # Empty the scratch directory so the next subfolder starts clean.
        _clear_directory(temporary_workspace)

Comments

Submit
0 Comments