Извлеките текст из документа с помощью Docs API

Иногда из документа нужно извлечь только текст — например, чтобы передать его в другой API-сервис. Весь текст документа содержится на его вкладках, в текстовых фрагментах (TextRun) элементов абзаца. Чтобы извлечь весь текст документа, нужно обойти иерархию дерева вкладок и вызвать методы получения (геттеры) объектов Tab и DocumentTab. Дополнительную информацию о вкладках см. в разделе «Работа с вкладками».

Текст может появляться в трех типах структурных элементов вкладки документа:

  • Абзац
  • Оглавление
  • Таблица

Таблицы могут быть вложены в другие таблицы. Поэтому, чтобы извлечь весь текст документа, необходимо обойти каждый вложенный структурный элемент.

Полное описание тела документа см. в руководстве «Структура документа».

В следующем примере для Google Docs API с помощью рекурсии выполняется обход каждого структурного элемента на всех вкладках документа и выводится найденный текст.

// Copyright 2019 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

import com.google.api.client.auth.oauth2.Credential;
import com.google.api.client.extensions.java6.auth.oauth2.AuthorizationCodeInstalledApp;
import com.google.api.client.extensions.jetty.auth.oauth2.LocalServerReceiver;
import com.google.api.client.googleapis.auth.oauth2.GoogleAuthorizationCodeFlow;
import com.google.api.client.googleapis.auth.oauth2.GoogleClientSecrets;
import com.google.api.client.googleapis.javanet.GoogleNetHttpTransport;
import com.google.api.client.http.javanet.NetHttpTransport;
import com.google.api.client.json.JsonFactory;
import com.google.api.client.json.jackson2.JacksonFactory;
import com.google.api.client.util.store.FileDataStoreFactory;
import com.google.api.services.docs.v1.Docs;
import com.google.api.services.docs.v1.DocsScopes;
import com.google.api.services.docs.v1.model.Document;
import com.google.api.services.docs.v1.model.DocumentTab;
import com.google.api.services.docs.v1.model.ParagraphElement;
import com.google.api.services.docs.v1.model.StructuralElement;
import com.google.api.services.docs.v1.model.Tab;
import com.google.api.services.docs.v1.model.TableCell;
import com.google.api.services.docs.v1.model.TableRow;
import com.google.api.services.docs.v1.model.TextRun;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.security.GeneralSecurityException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

public class ExtractText {
 
private static final String APPLICATION_NAME = "Google Docs API Extract Guide";
 
private static final JsonFactory JSON_FACTORY = JacksonFactory.getDefaultInstance();
 
private static final String TOKENS_DIRECTORY_PATH = "tokens";
 
private static final String DOCUMENT_ID = "YOUR_DOCUMENT_ID";

 
/**
   * Global instance of the scopes required by this quickstart. If modifying these scopes, delete
   * your previously saved tokens/ folder.
   */

 
private static final List<String> SCOPES =
     
Collections.singletonList(DocsScopes.DOCUMENTS_READONLY);

 
private static final String CREDENTIALS_FILE_PATH = "/credentials.json";

 
/**
   * Creates an authorized Credential object.
   *
   * @param HTTP_TRANSPORT The network HTTP Transport.
   * @return An authorized Credential object.
   * @throws IOException If the credentials.json file cannot be found.
   */

 
private static Credential getCredentials(final NetHttpTransport HTTP_TRANSPORT)
     
throws IOException {
   
// Load client secrets.
   
InputStream in = ExtractText.class.getResourceAsStream(CREDENTIALS_FILE_PATH);
   
GoogleClientSecrets clientSecrets =
       
GoogleClientSecrets.load(JSON_FACTORY, new InputStreamReader(in));

   
// Build flow and trigger user authorization request.
   
GoogleAuthorizationCodeFlow flow =
       
new GoogleAuthorizationCodeFlow.Builder(HTTP_TRANSPORT, JSON_FACTORY, clientSecrets, SCOPES)
           
.setDataStoreFactory(new FileDataStoreFactory(new java.io.File(TOKENS_DIRECTORY_PATH)))
           
.setAccessType("offline")
           
.build();
   
LocalServerReceiver receiver = new LocalServerReceiver.Builder().setPort(8888).build();
   
return new AuthorizationCodeInstalledApp(flow, receiver).authorize("user");
 
}

 
/**
   * Adds the provided tab to the list of all tabs, and recurses through and
   * adds all child tabs.
   */

 
private void addCurrentAndChildTabs(Tab tab, List<Tab> allTabs) {
    allTabs
.add(tab);
   
for (Tab tab: tab.getChildTabs()) {
      addCurrentAndChildTabs
(tab, allTabs);
   
}
 
}

 
/**
   * Returns a flat list of all tabs in the document in the order they would
   * appear in the UI (top-down ordering). Includes all child tabs.
   */

 
private List<Tab> getAllTabs(Document doc) {
   
List<Tab> allTabs = new ArrayList<>();
   
// Iterate over all tabs and recursively add any child tabs to generate a
   
// flat list of Tabs.
   
for (Tab tab: doc.getTabs()) {
      addCurrentAndChildTabs
(tab, allTabs);
   
}
   
return allTabs;
 
}

 
/**
   * Returns the text in the given ParagraphElement.
   *
   * @param element a ParagraphElement from a Google Doc
   */

 
private static String readParagraphElement(ParagraphElement element) {
   
TextRun run = element.getTextRun();
   
if (run == null || run.getContent() == null) {
     
// The TextRun can be null if there is an inline object.
     
return "";
   
}
   
return run.getContent();
 
}

 
/**
   * Recurses through a list of Structural Elements to read a document's text where text may be in
   * nested elements.
   *
   * @param elements a list of Structural Elements
   */

 
private static String readStructuralElements(List<StructuralElement> elements) {
   
StringBuilder sb = new StringBuilder();
   
for (StructuralElement element : elements) {
     
if (element.getParagraph() != null) {
       
for (ParagraphElement paragraphElement : element.getParagraph().getElements()) {
          sb
.append(readParagraphElement(paragraphElement));
       
}
     
} else if (element.getTable() != null) {
       
// The text in table cells are in nested Structural Elements and tables may be
       
// nested.
       
for (TableRow row : element.getTable().getTableRows()) {
         
for (TableCell cell : row.getTableCells()) {
            sb
.append(readStructuralElements(cell.getContent()));
         
}
       
}
     
} else if (element.getTableOfContents() != null) {
       
// The text in the TOC is also in a Structural Element.
        sb
.append(readStructuralElements(element.getTableOfContents().getContent()));
     
}
   
}
   
return sb.toString();
 
}

 
public static void main(String... args) throws IOException, GeneralSecurityException {
   
// Build a new authorized API client service.
   
final NetHttpTransport HTTP_TRANSPORT = GoogleNetHttpTransport.newTrustedTransport();
   
Docs service =
       
new Docs.Builder(HTTP_TRANSPORT, JSON_FACTORY, getCredentials(HTTP_TRANSPORT))
           
.setApplicationName(APPLICATION_NAME)
           
.build();

   
// Fetch the document with all of the tabs populated, including any nested
   
// child tabs.
   
Document doc =
        service
.documents().get(DOCUMENT_ID).setIncludeTabsContent(true).execute();
   
List<Tab> allTabs = getAllTabs(doc);

   
// Print the text from each tab in the document.
   
for (Tab tab: allTabs) {
     
// Get the DocumentTab from the generic Tab.
     
DocumentTab documentTab = tab.getDocumentTab();
     
System.out.println(
          readStructuralElements
(documentTab.getBody().getContent()));
   
}
 
}
}

# Copyright 2019 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Recursively extracts the text from a Google Doc.
"""

import googleapiclient.discovery as discovery
from httplib2 import Http
from oauth2client import client
from oauth2client import file
from oauth2client import tools

# OAuth scope passed to the client-secrets flow in get_credentials().
SCOPES = 'https://www.googleapis.com/auth/documents.readonly'
# Discovery document URL passed to discovery.build() in main().
DISCOVERY_DOC = 'https://docs.googleapis.com/$discovery/rest?version=v1'
# ID of the document to read; replace the placeholder before running.
DOCUMENT_ID = 'YOUR_DOCUMENT_ID'


def get_credentials():
  """Returns user credentials, running the OAuth 2.0 flow when needed.

  Credentials are read from the 'token.json' store. When the store holds
  nothing, or what it holds is marked invalid, the flow built from
  'credentials.json' is run and its result is returned instead.

  Returns:
      Credentials, the obtained credential.
  """
  token_store = file.Storage('token.json')
  cached = token_store.get()

  # Reuse the stored credentials whenever they are present and still valid.
  if cached and not cached.invalid:
    return cached

  auth_flow = client.flow_from_clientsecrets('credentials.json', SCOPES)
  return tools.run_flow(auth_flow, token_store)


def add_current_and_child_tabs(tab, all_tabs):
  """Adds the provided tab to the list of all tabs, and recurses through and
  adds all child tabs.

  Tabs are appended depth-first: a tab, then each of its children together
  with that child's own descendants, in order.

  Args:
      tab: a Tab from a Google Doc.
      all_tabs: a list of all tabs in the document; mutated in place.
  """
  all_tabs.append(tab)
  # dict.get returns None when 'childTabs' is missing; fall back to an empty
  # list so a tab without children does not raise TypeError.
  for child_tab in tab.get('childTabs') or []:
    add_current_and_child_tabs(child_tab, all_tabs)


def get_all_tabs(doc):
  """Returns a flat list of all tabs in the document in the order they would
  appear in the UI (top-down ordering). Includes all child tabs.

  Args:
      doc: a document.

  Returns:
      A list of tabs, each parent before its children; empty when the
      document has no 'tabs' entry.
  """
  all_tabs = []
  # Iterate over all tabs and recursively add any child tabs to generate a
  # flat list of Tabs. A missing 'tabs' key is treated as no tabs.
  for tab in doc.get('tabs') or []:
    add_current_and_child_tabs(tab, all_tabs)
  return all_tabs


def read_paragraph_element(element):
  """Returns the text in the given ParagraphElement.

  Args:
      element: a ParagraphElement from a Google Doc.

  Returns:
      The 'content' string of the element's 'textRun', or '' when the element
      has no text run or the text run has no content.
  """
  text_run = element.get('textRun')
  if not text_run:
    return ''
  # Always hand back a string so callers can concatenate the result.
  return text_run.get('content') or ''


def read_structural_elements(elements):
  """Recurses through a list of Structural Elements to read a document's text
  where text may be in nested elements.

  Args:
      elements: a list of Structural Elements; None is treated as empty.

  Returns:
      The concatenated text of every paragraph, table cell and table of
      contents found in the list, in order.
  """
  parts = []
  for value in elements or []:
    if 'paragraph' in value:
      # Use a separate name so the list being iterated is never rebound.
      paragraph_elements = value['paragraph'].get('elements') or []
      for elem in paragraph_elements:
        parts.append(read_paragraph_element(elem))
    elif 'table' in value:
      # The text in table cells are in nested Structural Elements and tables
      # may be nested.
      for row in value['table'].get('tableRows') or []:
        for cell in row.get('tableCells') or []:
          parts.append(read_structural_elements(cell.get('content')))
    elif 'tableOfContents' in value:
      # The text in the TOC is also in a Structural Element.
      toc = value['tableOfContents']
      parts.append(read_structural_elements(toc.get('content')))
  # Join once at the end instead of growing a string with repeated +=.
  return ''.join(parts)


def main():
  """Uses the Docs API to print out the text of a document.

  Authorizes an HTTP client, fetches the document identified by DOCUMENT_ID
  with the content of all tabs included, and prints the text of each tab.
  """
  credentials = get_credentials()
  http = credentials.authorize(Http())
  docs_service = discovery.build(
      'docs', 'v1', http=http, discoveryServiceUrl=DISCOVERY_DOC
  )
  # Fetch the document with all of the tabs populated, including any nested
  # child tabs. The keyword must be the API parameter name,
  # includeTabsContent; the snake_case spelling is rejected by the client.
  doc = (
      docs_service.documents()
      .get(documentId=DOCUMENT_ID, includeTabsContent=True)
      .execute()
  )
  all_tabs = get_all_tabs(doc)

  # Print the text from each tab in the document.
  for tab in all_tabs:
    # Get the DocumentTab from the generic Tab.
    document_tab = tab.get('documentTab')
    doc_content = document_tab.get('body').get('content')
    print(read_structural_elements(doc_content))


# Run the sample only when this file is executed as a script.
if __name__ == '__main__':
  main()