tessdata: uncompress tarball only once to speed up builds

The previous approach was to uncompress N times a big tarball (638 MB)
where N=130 is the number of supported languages.  Each iteration would
only extract a single file, but it still needs to uncompress the whole
tarball.  This is of course completely inefficient.

Now, we uncompress the tarball only once to extract all relevant files,
and then iterate N times to copy the file needed for each language.

This massively speeds up builds, at the expense of temporarily requiring
more build space (about 1 GB more)

Signed-off-by: Baptiste Jonglez <git@bitsofnetworks.org>
This commit is contained in:
Baptiste Jonglez 2021-06-30 14:09:42 +02:00
parent 1204cb82f9
commit 7fe513971f
1 changed files with 8 additions and 2 deletions

View File

@ -18,7 +18,7 @@ PKG_MAINTAINER:=Valentín Kivachuk <vk18496@gmail.com>
PKG_LICENSE:=Apache-2.0 PKG_LICENSE:=Apache-2.0
PKG_LICENSE_FILES:=COPYING PKG_LICENSE_FILES:=COPYING
#No need to extract 1,5GB... #No need to extract 1,5GB... We only extract what we need.
PKG_UNPACK:= PKG_UNPACK:=
include $(INCLUDE_DIR)/package.mk include $(INCLUDE_DIR)/package.mk
@ -26,6 +26,11 @@ include $(INCLUDE_DIR)/package.mk
ALLTESSERACTLANG:=afr amh ara asm aze aze_cyrl bel ben bod bos bre bul cat ceb ces chi_sim chi_sim_vert chi_tra chi_tra_vert chr cos cym dan dan_frak deu deu_frak div dzo ell eng enm epo equ est eus fao fas fil fin fra frk frm fry gla gle glg grc guj hat heb hin hrv hun hye iku ind isl ita ita_old jav jpn jpn_vert kan kat kat_old kaz khm kir kmr kor kor_vert lao lat lav lit ltz mal mar mkd mlt mon mri msa mya nep nld nor oci ori osd pan pol por pus que ron rus san sin slk slk_frak slv snd spa spa_old sqi srp srp_latn sun swa swe syr tam tat tel tgk tgl tha tir ton tur uig ukr urd uzb uzb_cyrl vie yid yor ALLTESSERACTLANG:=afr amh ara asm aze aze_cyrl bel ben bod bos bre bul cat ceb ces chi_sim chi_sim_vert chi_tra chi_tra_vert chr cos cym dan dan_frak deu deu_frak div dzo ell eng enm epo equ est eus fao fas fil fin fra frk frm fry gla gle glg grc guj hat heb hin hrv hun hye iku ind isl ita ita_old jav jpn jpn_vert kan kat kat_old kaz khm kir kmr kor kor_vert lao lat lav lit ltz mal mar mkd mlt mon mri msa mya nep nld nor oci ori osd pan pol por pus que ron rus san sin slk slk_frak slv snd spa spa_old sqi srp srp_latn sun swa swe syr tam tat tel tgk tgl tha tir ton tur uig ukr urd uzb uzb_cyrl vie yid yor
define Build/Prepare
$(Build/Prepare/Default)
$(TAR) --strip=1 -C $(PKG_BUILD_DIR) -xvf $(DL_DIR)/$(PKG_NAME)-$(PKG_VERSION).tar.gz --exclude 'script' --wildcards '*.traineddata'
endef
define Build/Compile define Build/Compile
endef endef
@ -37,6 +42,7 @@ define Package/tesseract-data-default
SECTION:=utils SECTION:=utils
CATEGORY:=Utilities CATEGORY:=Utilities
DEPENDS:=tesseract DEPENDS:=tesseract
PKGARCH:=all
endef endef
define generate-tesseract-data-package define generate-tesseract-data-package
@ -47,7 +53,7 @@ define generate-tesseract-data-package
define Package/tesseract-data-$(1)/install define Package/tesseract-data-$(1)/install
$(INSTALL_DIR) $$(1)/usr/share/tessdata $(INSTALL_DIR) $$(1)/usr/share/tessdata
$(TAR) --strip=1 -C $$(1)/usr/share/tessdata/ -xvf $(DL_DIR)/$(PKG_NAME)-$(PKG_VERSION).tar.gz $(PKG_NAME)-$(PKG_VERSION)/$(1).traineddata $(INSTALL_DATA) $(PKG_BUILD_DIR)/$(1).traineddata $$(1)/usr/share/tessdata/
endef endef
endef endef